diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index ecd21f9d02095..b4392cb4fbc69 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -7,6 +7,7 @@ ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16, -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX13 %s define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { ; GCN-LABEL: s_mul_i16: @@ -29,6 +30,11 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_mul_i32 s0, s0, s1 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_mul_i32 s0, s0, s1 +; GFX13-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -88,6 +94,16 @@ define i16 @v_mul_i16(i16 %num, i16 %den) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -131,6 +147,13 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg 
zeroext %num, i16 inre ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_i16_zeroext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_mul_i32 s0, s0, s1 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX13-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -196,6 +219,18 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_i16_zeroext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -227,6 +262,13 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_sext_i32_i16 s0, s0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_i16_signext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_mul_i32 s0, s0, s1 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_sext_i32_i16 s0, s0 +; GFX13-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result } @@ -296,6 +338,18 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_i16_signext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: 
s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = mul i16 %num, %den ret i16 %result } @@ -321,6 +375,11 @@ define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) { ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_mul_i32 s0, s0, s1 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_mul_i32 s0, s0, s1 +; GFX13-NEXT: ; return to shader part epilog %result = mul i32 %num, %den ret i32 %result } @@ -354,6 +413,16 @@ define i32 @v_mul_i32(i32 %num, i32 %den) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = mul i32 %num, %den ret i32 %result } @@ -433,6 +502,17 @@ define amdgpu_ps <2 x i16> @s_mul_v2i16(<2 x i16> inreg %num, <2 x i16> inreg %d ; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 ; GFX1250-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_v2i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: v_pk_mul_lo_u16 v0, s0, s1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: v_readfirstlane_b32 s0, v0 +; GFX13-NEXT: s_lshr_b32 s1, s0, 16 +; GFX13-NEXT: s_add_co_i32 s0, s0, s0 +; GFX13-NEXT: s_add_co_i32 s1, s1, s1 +; GFX13-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX13-NEXT: ; return to shader part epilog %mul = mul <2 x i16> %num, %den %result = add <2 x i16> %mul, 
%mul ret <2 x i16> %result @@ -490,6 +570,16 @@ define <2 x i16> @v_mul_v2i16(<2 x i16> %num, <2 x i16> %den) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_v2i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = mul <2 x i16> %num, %den ret <2 x i16> %result } @@ -519,6 +609,12 @@ define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %d ; GFX1250-NEXT: s_mul_i32 s0, s0, s2 ; GFX1250-NEXT: s_mul_i32 s1, s1, s3 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_v2i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_mul_i32 s0, s0, s2 +; GFX13-NEXT: s_mul_i32 s1, s1, s3 +; GFX13-NEXT: ; return to shader part epilog %result = mul <2 x i32> %num, %den ret <2 x i32> %result } @@ -556,6 +652,17 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) { ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_v2i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX13-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = mul <2 x i32> %num, %den ret <2 x i32> %result } @@ -618,6 +725,11 @@ define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) { ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_i33: +; GFX13: ; %bb.0: +; GFX13-NEXT: 
s_mul_u64 s[0:1], s[0:1], s[2:3] +; GFX13-NEXT: ; return to shader part epilog %result = mul i33 %num, %den ret i33 %result } @@ -680,6 +792,11 @@ define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) { ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 ; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] +; GFX13-NEXT: ; return to shader part epilog %result = mul i64 %num, %den ret i64 %result } @@ -737,6 +854,21 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX13-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX13-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_add3_u32 v1, v3, v1, v4 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = mul i64 %num, %den ret i64 %result } @@ -874,6 +1006,26 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) { ; GFX1250-NEXT: s_add_co_ci_u32 s2, s3, s0 ; GFX1250-NEXT: s_mov_b32 s0, s5 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_i96: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_mul_i32 s6, s0, s5 +; GFX13-NEXT: s_mul_i32 s7, s1, s4 +; GFX13-NEXT: s_mul_i32 s2, s2, s3 +; GFX13-NEXT: s_add_co_i32 s6, s6, s7 +; GFX13-NEXT: s_mul_hi_u32 s7, s0, s3 +; GFX13-NEXT: s_add_co_i32 s6, s6, s2 +; GFX13-NEXT: s_mul_i32 s2, s0, s4 +; GFX13-NEXT: s_mul_i32 s5, s0, s3 +; GFX13-NEXT: s_mul_hi_u32 s0, s0, s4 +; GFX13-NEXT: s_add_co_u32 s2, s2, s7 +; GFX13-NEXT: 
s_mul_i32 s4, s1, s3 +; GFX13-NEXT: s_add_co_ci_u32 s0, s0, s6 +; GFX13-NEXT: s_mul_hi_u32 s3, s1, s3 +; GFX13-NEXT: s_add_co_u32 s1, s4, s2 +; GFX13-NEXT: s_add_co_ci_u32 s2, s3, s0 +; GFX13-NEXT: s_mov_b32 s0, s5 +; GFX13-NEXT: ; return to shader part epilog %result = mul i96 %num, %den %cast = bitcast i96 %result to <3 x i32> ret <3 x i32> %cast @@ -960,6 +1112,26 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_i96: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 +; GFX13-NEXT: v_mov_b32_e32 v8, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v0, v6, v5 +; GFX13-NEXT: v_mad_co_u64_u32 v[9:10], null, v7, v4, v[0:1] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v8, 0 +; GFX13-NEXT: v_mad_co_u64_u32 v[2:3], null, v2, v8, v[9:10] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[9:10], null, v6, v4, v[1:2] +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v8, v[9:10] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = mul i96 %num, %den ret i96 %result } @@ -1206,6 +1378,42 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) { ; GFX1250-NEXT: s_mov_b32 s1, s8 ; GFX1250-NEXT: s_mov_b32 s2, s7 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_i128: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_mul_i32 s9, s0, s6 +; GFX13-NEXT: s_mul_i32 s11, s1, s5 +; GFX13-NEXT: s_mul_hi_u32 s10, s0, s6 +; GFX13-NEXT: s_mul_hi_u32 s12, s1, s5 +; 
GFX13-NEXT: s_add_co_u32 s9, s11, s9 +; GFX13-NEXT: s_mul_i32 s11, s2, s4 +; GFX13-NEXT: s_add_co_ci_u32 s10, s12, s10 +; GFX13-NEXT: s_mul_hi_u32 s12, s2, s4 +; GFX13-NEXT: s_mul_hi_u32 s8, s0, s4 +; GFX13-NEXT: s_add_co_u32 s9, s11, s9 +; GFX13-NEXT: s_mul_i32 s11, s0, s5 +; GFX13-NEXT: s_add_co_ci_u32 s10, s12, s10 +; GFX13-NEXT: s_mul_hi_u32 s12, s0, s5 +; GFX13-NEXT: s_add_co_u32 s8, s11, s8 +; GFX13-NEXT: s_add_co_ci_u32 s9, s12, s9 +; GFX13-NEXT: s_mul_i32 s12, s1, s4 +; GFX13-NEXT: s_mul_hi_u32 s13, s1, s4 +; GFX13-NEXT: s_cselect_b32 s11, 1, 0 +; GFX13-NEXT: s_add_co_u32 s8, s12, s8 +; GFX13-NEXT: s_mul_i32 s12, s0, s7 +; GFX13-NEXT: s_add_co_ci_u32 s7, s13, s9 +; GFX13-NEXT: s_add_co_ci_u32 s9, s10, s12 +; GFX13-NEXT: s_mul_i32 s1, s1, s6 +; GFX13-NEXT: s_cmp_lg_u32 s11, 0 +; GFX13-NEXT: s_mul_i32 s2, s2, s5 +; GFX13-NEXT: s_add_co_ci_u32 s1, s9, s1 +; GFX13-NEXT: s_mul_i32 s3, s3, s4 +; GFX13-NEXT: s_add_co_i32 s1, s1, s2 +; GFX13-NEXT: s_mul_i32 s0, s0, s4 +; GFX13-NEXT: s_add_co_i32 s3, s1, s3 +; GFX13-NEXT: s_mov_b32 s1, s8 +; GFX13-NEXT: s_mov_b32 s2, s7 +; GFX13-NEXT: ; return to shader part epilog %result = mul i128 %num, %den %cast = bitcast i128 %result to <4 x i32> ret <4 x i32> %cast @@ -1377,6 +1585,34 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX1250-NEXT: v_mad_u32 v3, v3, v4, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v6 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_i128: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1 +; GFX13-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0 +; GFX13-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX13-NEXT: v_mul_lo_u32 v6, v9, v6 +; GFX13-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mad_co_u64_u32 v[12:13], null, v9, v5, v[0:1] +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 +; GFX13-NEXT: v_mad_co_u64_u32 v[2:3], null, v10, v4, v[12:13] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2] +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[12:13] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s0 +; GFX13-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[3:4] +; GFX13-NEXT: v_mad_co_u64_u32 v[3:4], null, v11, v4, v[5:6] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = mul i128 %num, %den ret i128 %result } @@ -2589,6 +2825,193 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX1250-NEXT: s_add_co_i32 s7, s1, s7 ; GFX1250-NEXT: s_mov_b32 s1, s16 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_i256: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_mul_i32 s17, s0, s10 +; GFX13-NEXT: s_mul_i32 s19, s1, s9 +; GFX13-NEXT: s_mul_hi_u32 s18, s0, s10 +; GFX13-NEXT: s_mul_hi_u32 s20, s1, s9 +; GFX13-NEXT: s_add_co_u32 s17, s19, s17 +; GFX13-NEXT: s_add_co_ci_u32 s18, s20, s18 +; GFX13-NEXT: s_mul_i32 s20, s2, s8 +; GFX13-NEXT: s_mul_hi_u32 s21, s2, s8 +; GFX13-NEXT: s_cselect_b32 s19, 1, 0 +; GFX13-NEXT: s_add_co_u32 s17, s20, s17 +; GFX13-NEXT: s_mul_hi_u32 s16, s0, s8 +; GFX13-NEXT: s_add_co_ci_u32 s18, s21, s18 +; GFX13-NEXT: s_mul_i32 s21, s0, s9 +; GFX13-NEXT: s_mul_hi_u32 s22, s0, s9 +; GFX13-NEXT: s_cselect_b32 s20, 1, 0 +; GFX13-NEXT: s_add_co_u32 s16, s21, s16 +; GFX13-NEXT: s_add_co_ci_u32 s17, s22, s17 +; GFX13-NEXT: s_mul_i32 s22, s1, s8 +; GFX13-NEXT: s_mul_hi_u32 s23, s1, 
s8 +; GFX13-NEXT: s_cselect_b32 s21, 1, 0 +; GFX13-NEXT: s_add_co_u32 s16, s22, s16 +; GFX13-NEXT: s_add_co_ci_u32 s17, s23, s17 +; GFX13-NEXT: s_mul_i32 s23, s0, s12 +; GFX13-NEXT: s_mul_i32 s25, s1, s11 +; GFX13-NEXT: s_mul_hi_u32 s24, s0, s12 +; GFX13-NEXT: s_mul_hi_u32 s26, s1, s11 +; GFX13-NEXT: s_cselect_b32 s22, 1, 0 +; GFX13-NEXT: s_add_co_u32 s23, s25, s23 +; GFX13-NEXT: s_add_co_ci_u32 s24, s26, s24 +; GFX13-NEXT: s_mul_i32 s26, s2, s10 +; GFX13-NEXT: s_mul_hi_u32 s27, s2, s10 +; GFX13-NEXT: s_cselect_b32 s25, 1, 0 +; GFX13-NEXT: s_add_co_u32 s23, s26, s23 +; GFX13-NEXT: s_add_co_ci_u32 s24, s27, s24 +; GFX13-NEXT: s_mul_i32 s27, s3, s9 +; GFX13-NEXT: s_mul_hi_u32 s28, s3, s9 +; GFX13-NEXT: s_cselect_b32 s26, 1, 0 +; GFX13-NEXT: s_add_co_u32 s23, s27, s23 +; GFX13-NEXT: s_add_co_ci_u32 s24, s28, s24 +; GFX13-NEXT: s_mul_i32 s28, s4, s8 +; GFX13-NEXT: s_mul_hi_u32 s29, s4, s8 +; GFX13-NEXT: s_cselect_b32 s27, 1, 0 +; GFX13-NEXT: s_add_co_u32 s23, s28, s23 +; GFX13-NEXT: s_add_co_ci_u32 s24, s29, s24 +; GFX13-NEXT: s_mul_i32 s29, s0, s11 +; GFX13-NEXT: s_mul_hi_u32 s30, s0, s11 +; GFX13-NEXT: s_cselect_b32 s28, 1, 0 +; GFX13-NEXT: s_add_co_u32 s18, s29, s18 +; GFX13-NEXT: s_add_co_ci_u32 s23, s30, s23 +; GFX13-NEXT: s_mul_i32 s30, s1, s10 +; GFX13-NEXT: s_mul_hi_u32 s31, s1, s10 +; GFX13-NEXT: s_cselect_b32 s29, 1, 0 +; GFX13-NEXT: s_add_co_u32 s18, s30, s18 +; GFX13-NEXT: s_add_co_ci_u32 s23, s31, s23 +; GFX13-NEXT: s_mul_i32 s31, s2, s9 +; GFX13-NEXT: s_mul_hi_u32 s33, s2, s9 +; GFX13-NEXT: s_cselect_b32 s30, 1, 0 +; GFX13-NEXT: s_add_co_u32 s18, s31, s18 +; GFX13-NEXT: s_add_co_ci_u32 s23, s33, s23 +; GFX13-NEXT: s_mul_i32 s33, s3, s8 +; GFX13-NEXT: s_mul_hi_u32 s34, s3, s8 +; GFX13-NEXT: s_cselect_b32 s31, 1, 0 +; GFX13-NEXT: s_add_co_u32 s18, s33, s18 +; GFX13-NEXT: s_add_co_ci_u32 s23, s34, s23 +; GFX13-NEXT: s_cselect_b32 s33, 1, 0 +; GFX13-NEXT: s_cmp_lg_u32 s21, 0 +; GFX13-NEXT: s_mul_hi_u32 s34, s1, s13 +; GFX13-NEXT: s_cselect_b32 s21, 1, 0 +; 
GFX13-NEXT: s_cmp_lg_u32 s22, 0 +; GFX13-NEXT: s_mul_hi_u32 s22, s0, s14 +; GFX13-NEXT: s_add_co_ci_u32 s18, s21, s18 +; GFX13-NEXT: s_cselect_b32 s21, 1, 0 +; GFX13-NEXT: s_cmp_lg_u32 s19, 0 +; GFX13-NEXT: s_mul_hi_u32 s35, s1, s12 +; GFX13-NEXT: s_cselect_b32 s19, 1, 0 +; GFX13-NEXT: s_cmp_lg_u32 s20, 0 +; GFX13-NEXT: s_mul_hi_u32 s36, s2, s11 +; GFX13-NEXT: s_add_co_ci_u32 s19, s19, 0 +; GFX13-NEXT: s_cmp_lg_u32 s21, 0 +; GFX13-NEXT: s_mul_i32 s21, s0, s14 +; GFX13-NEXT: s_add_co_ci_u32 s19, s19, s23 +; GFX13-NEXT: s_mul_i32 s23, s1, s13 +; GFX13-NEXT: s_cselect_b32 s20, 1, 0 +; GFX13-NEXT: s_add_co_u32 s21, s23, s21 +; GFX13-NEXT: s_mul_i32 s23, s2, s12 +; GFX13-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX13-NEXT: s_mul_hi_u32 s34, s2, s12 +; GFX13-NEXT: s_add_co_u32 s21, s23, s21 +; GFX13-NEXT: s_mul_i32 s23, s3, s11 +; GFX13-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX13-NEXT: s_mul_hi_u32 s34, s3, s11 +; GFX13-NEXT: s_add_co_u32 s21, s23, s21 +; GFX13-NEXT: s_mul_i32 s23, s4, s10 +; GFX13-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX13-NEXT: s_mul_hi_u32 s34, s4, s10 +; GFX13-NEXT: s_add_co_u32 s21, s23, s21 +; GFX13-NEXT: s_mul_i32 s23, s5, s9 +; GFX13-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX13-NEXT: s_mul_hi_u32 s34, s5, s9 +; GFX13-NEXT: s_add_co_u32 s21, s23, s21 +; GFX13-NEXT: s_mul_i32 s23, s6, s8 +; GFX13-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX13-NEXT: s_mul_hi_u32 s34, s6, s8 +; GFX13-NEXT: s_add_co_u32 s21, s23, s21 +; GFX13-NEXT: s_mul_i32 s23, s0, s13 +; GFX13-NEXT: s_add_co_ci_u32 s22, s34, s22 +; GFX13-NEXT: s_mul_hi_u32 s34, s0, s13 +; GFX13-NEXT: s_add_co_u32 s23, s23, s24 +; GFX13-NEXT: s_add_co_ci_u32 s21, s34, s21 +; GFX13-NEXT: s_mul_i32 s34, s1, s12 +; GFX13-NEXT: s_cselect_b32 s24, 1, 0 +; GFX13-NEXT: s_add_co_u32 s23, s34, s23 +; GFX13-NEXT: s_add_co_ci_u32 s21, s35, s21 +; GFX13-NEXT: s_mul_i32 s35, s2, s11 +; GFX13-NEXT: s_cselect_b32 s34, 1, 0 +; GFX13-NEXT: s_add_co_u32 s23, s35, s23 +; GFX13-NEXT: s_add_co_ci_u32 s21, s36, s21 +; 
GFX13-NEXT: s_mul_i32 s36, s3, s10 +; GFX13-NEXT: s_mul_hi_u32 s37, s3, s10 +; GFX13-NEXT: s_cselect_b32 s35, 1, 0 +; GFX13-NEXT: s_add_co_u32 s23, s36, s23 +; GFX13-NEXT: s_add_co_ci_u32 s21, s37, s21 +; GFX13-NEXT: s_mul_i32 s37, s4, s9 +; GFX13-NEXT: s_mul_hi_u32 s38, s4, s9 +; GFX13-NEXT: s_cselect_b32 s36, 1, 0 +; GFX13-NEXT: s_add_co_u32 s23, s37, s23 +; GFX13-NEXT: s_add_co_ci_u32 s21, s38, s21 +; GFX13-NEXT: s_mul_i32 s38, s5, s8 +; GFX13-NEXT: s_mul_hi_u32 s39, s5, s8 +; GFX13-NEXT: s_cselect_b32 s37, 1, 0 +; GFX13-NEXT: s_add_co_u32 s23, s38, s23 +; GFX13-NEXT: s_add_co_ci_u32 s21, s39, s21 +; GFX13-NEXT: s_cselect_b32 s38, 1, 0 +; GFX13-NEXT: s_cmp_lg_u32 s29, 0 +; GFX13-NEXT: s_mul_i32 s1, s1, s14 +; GFX13-NEXT: s_cselect_b32 s29, 1, 0 +; GFX13-NEXT: s_cmp_lg_u32 s30, 0 +; GFX13-NEXT: s_mul_i32 s2, s2, s13 +; GFX13-NEXT: s_add_co_ci_u32 s29, s29, 0 +; GFX13-NEXT: s_cmp_lg_u32 s31, 0 +; GFX13-NEXT: s_mul_i32 s3, s3, s12 +; GFX13-NEXT: s_add_co_ci_u32 s29, s29, 0 +; GFX13-NEXT: s_cmp_lg_u32 s33, 0 +; GFX13-NEXT: s_mul_i32 s4, s4, s11 +; GFX13-NEXT: s_add_co_ci_u32 s29, s29, 0 +; GFX13-NEXT: s_cmp_lg_u32 s20, 0 +; GFX13-NEXT: s_mul_i32 s5, s5, s10 +; GFX13-NEXT: s_add_co_ci_u32 s20, s29, s23 +; GFX13-NEXT: s_cselect_b32 s23, 1, 0 +; GFX13-NEXT: s_cmp_lg_u32 s25, 0 +; GFX13-NEXT: s_mul_i32 s6, s6, s9 +; GFX13-NEXT: s_cselect_b32 s25, 1, 0 +; GFX13-NEXT: s_cmp_lg_u32 s26, 0 +; GFX13-NEXT: s_mul_i32 s26, s0, s15 +; GFX13-NEXT: s_add_co_ci_u32 s25, s25, 0 +; GFX13-NEXT: s_cmp_lg_u32 s27, 0 +; GFX13-NEXT: s_mul_i32 s7, s7, s8 +; GFX13-NEXT: s_add_co_ci_u32 s25, s25, 0 +; GFX13-NEXT: s_cmp_lg_u32 s28, 0 +; GFX13-NEXT: s_mul_i32 s0, s0, s8 +; GFX13-NEXT: s_add_co_ci_u32 s25, s25, 0 +; GFX13-NEXT: s_cmp_lg_u32 s23, 0 +; GFX13-NEXT: s_add_co_ci_u32 s15, s25, s21 +; GFX13-NEXT: s_add_co_ci_u32 s21, s22, s26 +; GFX13-NEXT: s_cmp_lg_u32 s38, 0 +; GFX13-NEXT: s_add_co_ci_u32 s1, s21, s1 +; GFX13-NEXT: s_cmp_lg_u32 s37, 0 +; GFX13-NEXT: s_add_co_ci_u32 s1, s1, s2 +; 
GFX13-NEXT: s_cmp_lg_u32 s36, 0 +; GFX13-NEXT: s_mov_b32 s2, s17 +; GFX13-NEXT: s_add_co_ci_u32 s1, s1, s3 +; GFX13-NEXT: s_cmp_lg_u32 s35, 0 +; GFX13-NEXT: s_mov_b32 s3, s18 +; GFX13-NEXT: s_add_co_ci_u32 s1, s1, s4 +; GFX13-NEXT: s_cmp_lg_u32 s34, 0 +; GFX13-NEXT: s_mov_b32 s4, s19 +; GFX13-NEXT: s_add_co_ci_u32 s1, s1, s5 +; GFX13-NEXT: s_cmp_lg_u32 s24, 0 +; GFX13-NEXT: s_mov_b32 s5, s20 +; GFX13-NEXT: s_add_co_ci_u32 s1, s1, s6 +; GFX13-NEXT: s_mov_b32 s6, s15 +; GFX13-NEXT: s_add_co_i32 s7, s1, s7 +; GFX13-NEXT: s_mov_b32 s1, s16 +; GFX13-NEXT: ; return to shader part epilog %result = mul i256 %num, %den %cast = bitcast i256 %result to <8 x i32> ret <8 x i32> %cast @@ -3130,6 +3553,96 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v12 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_i256: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 +; GFX13-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3 +; GFX13-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v21, v5 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, v16, v14, 0 +; GFX13-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v22, v7 +; GFX13-NEXT: v_mad_co_u64_u32 v[25:26], null, v16, v10, 0 +; GFX13-NEXT: v_mul_lo_u32 v29, v20, v11 +; GFX13-NEXT: v_mul_lo_u32 v30, v19, v12 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mul_lo_u32 v27, v0, v9 +; GFX13-NEXT: v_mad_co_u64_u32 v[3:4], null, v17, v13, v[1:2] +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, v16, v12, 0 +; GFX13-NEXT: v_mad_co_u64_u32 v[5:6], null, v18, v12, v[3:4] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[3:4], s0, v17, v11, v[1:2] +; GFX13-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, v19, v11, v[5:6] +; GFX13-NEXT: v_mad_co_u64_u32 v[5:6], vcc_lo, v18, v10, v[3:4] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo +; GFX13-NEXT: v_mad_co_u64_u32 v[3:4], null, v20, v10, v[1:2] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v19, v9, v[5:6] +; GFX13-NEXT: v_mad_co_u64_u32 v[23:24], null, v21, v9, v[3:4] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v7, vcc_lo +; GFX13-NEXT: v_mad_co_u64_u32 v[4:5], vcc_lo, v20, v8, v[1:2] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX13-NEXT: v_mad_co_u64_u32 v[6:7], null, v0, v8, v[23:24] +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[25:26] +; GFX13-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v3, vcc_lo +; GFX13-NEXT: v_mul_lo_u32 v26, v21, v10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[23:24], vcc_lo, v16, v13, v[5:6] +; GFX13-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX13-NEXT: v_mad_co_u64_u32 v[2:3], s0, v18, v8, v[0:1] +; GFX13-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v5, s0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], s1, v17, v12, v[23:24] +; GFX13-NEXT: v_mad_co_u64_u32 v[5:6], s2, v16, v11, v[3:4] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX13-NEXT: v_mad_co_u64_u32 v[3:4], s0, v18, v11, v[0:1] +; GFX13-NEXT: 
v_cndmask_b32_e64 v11, 0, 1, s2 +; GFX13-NEXT: v_mad_co_u64_u32 v[23:24], s2, v17, v10, v[5:6] +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX13-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v11, s2 +; GFX13-NEXT: v_mad_co_u64_u32 v[5:6], s3, v19, v10, v[3:4] +; GFX13-NEXT: v_mad_co_u64_u32 v[3:4], s2, v18, v9, v[23:24] +; GFX13-NEXT: v_mul_lo_u32 v18, v18, v13 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX13-NEXT: v_mad_co_u64_u32 v[10:11], s4, v20, v9, v[5:6] +; GFX13-NEXT: v_mad_co_u64_u32 v[5:6], s5, v16, v9, v[1:2] +; GFX13-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v12, s2 +; GFX13-NEXT: v_mad_co_u64_u32 v[12:13], s2, v19, v8, v[3:4] +; GFX13-NEXT: v_cndmask_b32_e64 v3, 0, 1, s5 +; GFX13-NEXT: v_mul_lo_u32 v20, v16, v15 +; GFX13-NEXT: v_mul_lo_u32 v9, v17, v14 +; GFX13-NEXT: v_mad_co_u64_u32 v[14:15], s5, v21, v8, v[10:11] +; GFX13-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v1, s2 +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], s2, v17, v8, v[5:6] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2 +; GFX13-NEXT: v_add_co_ci_u32_e64 v4, s2, v28, v13, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_ci_u32_e64 v5, s2, v10, v14, s2 +; GFX13-NEXT: v_add_co_ci_u32_e64 v6, s2, v25, v15, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2 +; GFX13-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v9, s5 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v18, s4 +; GFX13-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s0 +; 
GFX13-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v26, s1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v27, vcc_lo +; GFX13-NEXT: v_mad_co_u64_u32 v[7:8], null, v22, v8, v[7:8] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = mul i256 %num, %den ret i256 %result } @@ -3197,6 +3710,14 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: s_mul_u64_zext_with_vregs: +; GFX13: ; %bb.0: +; GFX13-NEXT: global_load_b32 v4, v[2:3], off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0 +; GFX13-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = zext i32 %val to i64 %mul = mul i64 %ext, 80 @@ -3309,6 +3830,20 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: s_mul_u64_zext_with_sregs: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX13-NEXT: s_mov_b32 s3, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = zext i32 %val to i64 %mul = mul i64 %ext, 80 @@ -3393,6 +3928,14 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 
v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: s_mul_u64_sext_with_vregs: +; GFX13: ; %bb.0: +; GFX13-NEXT: global_load_b32 v4, v[2:3], off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0 +; GFX13-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = sext i32 %val to i64 %mul = mul i64 %ext, 80 @@ -3519,6 +4062,20 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: s_mul_u64_sext_with_sregs: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_ashr_i32 s3, s2, 31 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50 +; GFX13-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 %ext = sext i32 %val to i64 %mul = mul i64 %ext, 80 diff --git a/llvm/test/CodeGen/AMDGPU/add_u64.ll b/llvm/test/CodeGen/AMDGPU/add_u64.ll index 7967e3ea1aeed..acab7d6e41950 100644 --- a/llvm/test/CodeGen/AMDGPU/add_u64.ll +++ b/llvm/test/CodeGen/AMDGPU/add_u64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GCN,GFX12 %s define amdgpu_ps <2 x float> @test_add_u64_vv(i64 %a, i64 %b) { ; GFX12-LABEL: 
test_add_u64_vv: diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 0193feec27c86..0b2452cf11798 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -10,6 +10,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1030W64 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -enable-var-scope -check-prefixes=GFX13 %s define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { @@ -124,6 +125,19 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: sadd64rr: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: sadd64rr ; GCN-ISEL: bb.0.entry: ; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5 @@ -254,6 +268,17 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: sadd64ri: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_add_nc_u64 s[2:3], 
s[2:3], 0x123456789876 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: sadd64ri ; GCN-ISEL: bb.0.entry: ; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5 @@ -375,6 +400,18 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) { ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1] ; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: vadd64rr: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_u32 v0, s2, s2, v0 +; GFX13-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: vadd64rr ; GCN-ISEL: bb.0.entry: ; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 @@ -495,6 +532,18 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: vadd64ri: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0 +; GFX13-NEXT: v_add_co_ci_u32_e64 v1, null, 0x1234, 0, s2 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: vadd64ri ; GCN-ISEL: bb.0.entry: ; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 @@ -619,6 +668,18 @@ define amdgpu_kernel void @suaddo32(ptr 
addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: suaddo32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_add_co_i32 s0, s0, s1 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX13-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: suaddo32 ; GCN-ISEL: bb.0 (%ir-block.0): ; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5 @@ -767,6 +828,21 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: uaddo32_vcc_user: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 nv +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_co_u32 v1, s4, s6, s7 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: uaddo32_vcc_user ; GCN-ISEL: bb.0 (%ir-block.0): ; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5 @@ -936,6 +1012,21 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9] ; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: suaddo64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_add_co_u32 s4, s4, s6 +; GFX13-NEXT: 
s_add_co_ci_u32 s5, s5, s7 +; GFX13-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s4 +; GFX13-NEXT: s_cselect_b32 s4, -1, 0 +; GFX13-NEXT: v_mov_b32_e32 v1, s5 +; GFX13-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: suaddo64 ; GCN-ISEL: bb.0 (%ir-block.0): ; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5 @@ -1108,6 +1199,24 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: vuaddo64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 nv +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_u32 v0, s4, s6, v0 +; GFX13-NEXT: v_add_co_ci_u32_e64 v1, s4, s7, 0, s4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: vuaddo64 ; GCN-ISEL: bb.0 (%ir-block.0): ; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 @@ -1264,6 +1373,19 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: ssub64rr: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_sub_nc_u64 s[2:3], s[2:3], 
s[4:5] +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: ssub64rr ; GCN-ISEL: bb.0.entry: ; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5 @@ -1394,6 +1516,17 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: ssub64ri: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_sub_nc_u64 s[2:3], 0x123456789876, s[2:3] +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: ssub64ri ; GCN-ISEL: bb.0.entry: ; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5 @@ -1515,6 +1648,18 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) { ; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], s[2:3], v[0:1] ; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: vsub64rr: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_sub_co_u32 v0, s2, s2, v0 +; GFX13-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: vsub64rr ; GCN-ISEL: bb.0.entry: ; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 @@ -1635,6 +1780,18 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: 
global_store_b64 v1, v[2:3], s[0:1] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: vsub64ri: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0 +; GFX13-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: vsub64ri ; GCN-ISEL: bb.0.entry: ; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 @@ -1759,6 +1916,18 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: susubo32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX13-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: susubo32 ; GCN-ISEL: bb.0 (%ir-block.0): ; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5 @@ -1908,6 +2077,21 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: usubo32_vcc_user: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 nv +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_sub_co_u32 v1, s4, s6, s7 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX13-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: usubo32_vcc_user ; GCN-ISEL: bb.0 (%ir-block.0): ; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5 @@ -2077,6 +2261,21 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9] ; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: susubo64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_sub_co_u32 s4, s4, s6 +; GFX13-NEXT: s_sub_co_ci_u32 s5, s5, s7 +; GFX13-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s4 +; GFX13-NEXT: s_cselect_b32 s4, -1, 0 +; GFX13-NEXT: v_mov_b32_e32 v1, s5 +; GFX13-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: susubo64 ; GCN-ISEL: bb.0 (%ir-block.0): ; GCN-ISEL-NEXT: liveins: $sgpr4_sgpr5 @@ -2249,6 +2448,24 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3] ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: vusubo64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 nv +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_sub_co_u32 v0, s4, s6, v0 +; GFX13-NEXT: v_sub_co_ci_u32_e64 v1, s4, s7, 0, s4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX13-NEXT: s_clause 0x1 +; 
GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX13-NEXT: s_endpgm ; GCN-ISEL-LABEL: name: vusubo64 ; GCN-ISEL: bb.0 (%ir-block.0): ; GCN-ISEL-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 @@ -3520,6 +3737,157 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: .LBB16_4: ; GFX1250-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1250-NEXT: s_branch .LBB16_2 +; +; GFX13-LABEL: sudiv64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_cmp_lg_u32 s7, 0 +; GFX13-NEXT: s_mov_b32 s7, 0 +; GFX13-NEXT: s_cbranch_scc0 .LBB16_4 +; GFX13-NEXT: ; %bb.1: +; GFX13-NEXT: s_cvt_f32_u32 s6, s4 +; GFX13-NEXT: s_cvt_f32_u32 s8, s5 +; GFX13-NEXT: s_sub_nc_u64 s[10:11], 0, s[4:5] +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX13-NEXT: s_fmac_f32 s6, s8, 0x4f800000 +; GFX13-NEXT: v_s_rcp_f32 s6, s6 +; GFX13-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX13-NEXT: s_mul_f32 s6, s6, 0x5f7ffffc +; GFX13-NEXT: s_mul_f32 s8, s6, 0x2f800000 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX13-NEXT: s_trunc_f32 s8, s8 +; GFX13-NEXT: s_fmac_f32 s6, s8, 0xcf800000 +; GFX13-NEXT: s_cvt_u32_f32 s9, s8 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3) +; GFX13-NEXT: s_cvt_u32_f32 s8, s6 +; GFX13-NEXT: s_mul_u64 s[12:13], s[10:11], s[8:9] +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_mul_hi_u32 s15, s8, s13 +; GFX13-NEXT: s_mul_i32 s14, s8, s13 +; GFX13-NEXT: s_mul_hi_u32 s6, s8, s12 +; GFX13-NEXT: s_mul_i32 s17, s9, s12 +; GFX13-NEXT: s_add_nc_u64 s[14:15], s[6:7], s[14:15] +; GFX13-NEXT: 
s_mul_hi_u32 s16, s9, s12 +; GFX13-NEXT: s_mul_hi_u32 s18, s9, s13 +; GFX13-NEXT: s_add_co_u32 s6, s14, s17 +; GFX13-NEXT: s_add_co_ci_u32 s6, s15, s16 +; GFX13-NEXT: s_mul_i32 s12, s9, s13 +; GFX13-NEXT: s_add_co_ci_u32 s13, s18, 0 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: s_add_nc_u64 s[12:13], s[6:7], s[12:13] +; GFX13-NEXT: s_add_co_u32 s8, s8, s12 +; GFX13-NEXT: s_add_co_ci_u32 s9, s9, s13 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9] +; GFX13-NEXT: s_mul_hi_u32 s13, s8, s11 +; GFX13-NEXT: s_mul_i32 s12, s8, s11 +; GFX13-NEXT: s_mul_hi_u32 s6, s8, s10 +; GFX13-NEXT: s_mul_i32 s15, s9, s10 +; GFX13-NEXT: s_add_nc_u64 s[12:13], s[6:7], s[12:13] +; GFX13-NEXT: s_mul_hi_u32 s14, s9, s10 +; GFX13-NEXT: s_mul_hi_u32 s16, s9, s11 +; GFX13-NEXT: s_add_co_u32 s6, s12, s15 +; GFX13-NEXT: s_add_co_ci_u32 s6, s13, s14 +; GFX13-NEXT: s_mul_i32 s10, s9, s11 +; GFX13-NEXT: s_add_co_ci_u32 s11, s16, 0 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: s_add_nc_u64 s[10:11], s[6:7], s[10:11] +; GFX13-NEXT: s_add_co_u32 s8, s8, s10 +; GFX13-NEXT: s_add_co_ci_u32 s10, s9, s11 +; GFX13-NEXT: s_mul_hi_u32 s6, s2, s8 +; GFX13-NEXT: s_mul_hi_u32 s11, s3, s8 +; GFX13-NEXT: s_mul_i32 s12, s3, s8 +; GFX13-NEXT: s_mul_hi_u32 s9, s2, s10 +; GFX13-NEXT: s_mul_i32 s8, s2, s10 +; GFX13-NEXT: s_mul_hi_u32 s13, s3, s10 +; GFX13-NEXT: s_add_nc_u64 s[8:9], s[6:7], s[8:9] +; GFX13-NEXT: s_mul_i32 s10, s3, s10 +; GFX13-NEXT: s_add_co_u32 s6, s8, s12 +; GFX13-NEXT: s_add_co_ci_u32 s6, s9, s11 +; GFX13-NEXT: s_add_co_ci_u32 s11, s13, 0 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: s_add_nc_u64 s[8:9], s[6:7], s[10:11] +; GFX13-NEXT: s_and_b64 s[10:11], s[8:9], 0xffffffff00000000 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | 
instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: s_or_b32 s10, s10, s8 +; GFX13-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_sub_co_u32 s6, s2, s8 +; GFX13-NEXT: s_cselect_b32 s8, -1, 0 +; GFX13-NEXT: s_sub_co_i32 s12, s3, s9 +; GFX13-NEXT: s_cmp_lg_u32 s8, 0 +; GFX13-NEXT: s_sub_co_ci_u32 s12, s12, s5 +; GFX13-NEXT: s_sub_co_u32 s13, s6, s4 +; GFX13-NEXT: s_sub_co_ci_u32 s12, s12, 0 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_cmp_ge_u32 s12, s5 +; GFX13-NEXT: s_cselect_b32 s14, -1, 0 +; GFX13-NEXT: s_cmp_ge_u32 s13, s4 +; GFX13-NEXT: s_cselect_b32 s15, -1, 0 +; GFX13-NEXT: s_cmp_eq_u32 s12, s5 +; GFX13-NEXT: s_add_nc_u64 s[12:13], s[10:11], 1 +; GFX13-NEXT: s_cselect_b32 s16, s15, s14 +; GFX13-NEXT: s_add_nc_u64 s[14:15], s[10:11], 2 +; GFX13-NEXT: s_cmp_lg_u32 s16, 0 +; GFX13-NEXT: s_cselect_b32 s12, s14, s12 +; GFX13-NEXT: s_cselect_b32 s13, s15, s13 +; GFX13-NEXT: s_cmp_lg_u32 s8, 0 +; GFX13-NEXT: s_sub_co_ci_u32 s3, s3, s9 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_cmp_ge_u32 s3, s5 +; GFX13-NEXT: s_cselect_b32 s8, -1, 0 +; GFX13-NEXT: s_cmp_ge_u32 s6, s4 +; GFX13-NEXT: s_cselect_b32 s6, -1, 0 +; GFX13-NEXT: s_cmp_eq_u32 s3, s5 +; GFX13-NEXT: s_cselect_b32 s3, s6, s8 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_cmp_lg_u32 s3, 0 +; GFX13-NEXT: s_cselect_b32 s9, s13, s11 +; GFX13-NEXT: s_cselect_b32 s8, s12, s10 +; GFX13-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 +; GFX13-NEXT: s_cbranch_vccnz .LBB16_3 +; GFX13-NEXT: .LBB16_2: +; GFX13-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX13-NEXT: s_sub_co_i32 s5, 0, s4 +; GFX13-NEXT: s_mov_b32 s9, 0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX13-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX13-NEXT: v_nop +; GFX13-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: 
v_cvt_u32_f32_e32 v0, v0 +; GFX13-NEXT: v_readfirstlane_b32 s3, v0 +; GFX13-NEXT: s_mul_i32 s5, s5, s3 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX13-NEXT: s_add_co_i32 s3, s3, s5 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: s_mul_hi_u32 s3, s2, s3 +; GFX13-NEXT: s_mul_i32 s5, s3, s4 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_sub_co_i32 s2, s2, s5 +; GFX13-NEXT: s_add_co_i32 s5, s3, 1 +; GFX13-NEXT: s_sub_co_i32 s6, s2, s4 +; GFX13-NEXT: s_cmp_ge_u32 s2, s4 +; GFX13-NEXT: s_cselect_b32 s3, s5, s3 +; GFX13-NEXT: s_cselect_b32 s2, s6, s2 +; GFX13-NEXT: s_add_co_i32 s5, s3, 1 +; GFX13-NEXT: s_cmp_ge_u32 s2, s4 +; GFX13-NEXT: s_cselect_b32 s8, s5, s3 +; GFX13-NEXT: .LBB16_3: ; %.split +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s8 +; GFX13-NEXT: v_mov_b32_e32 v1, s9 +; GFX13-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-NEXT: s_endpgm +; GFX13-NEXT: .LBB16_4: +; GFX13-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GFX13-NEXT: s_branch .LBB16_2 ; GCN-ISEL-LABEL: name: sudiv64 ; GCN-ISEL: bb.0 (%ir-block.0): ; GCN-ISEL-NEXT: successors: %bb.3(0x50000000), %bb.1(0x30000000) diff --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll index defe16e729c3b..29fd371bccd84 100644 --- a/llvm/test/CodeGen/AMDGPU/literal64.ll +++ b/llvm/test/CodeGen/AMDGPU/literal64.ll @@ -1,154 +1,289 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GCN-SDAG %s -; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GCN-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck 
--check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck --check-prefixes=GFX13,GFX13-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck --check-prefixes=GFX13,GFX13-GISEL %s define amdgpu_ps i64 @s_add_u64(i64 inreg %a) { -; GCN-LABEL: s_add_u64: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0xf12345678 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: s_add_u64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0xf12345678 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_add_u64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0xf12345678 +; GFX13-NEXT: ; return to shader part epilog %result = add i64 %a, 64729929336 ret i64 %result } define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) { -; GCN-LABEL: v_add_u64: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_add_nc_u64_e32 v[0:1], 0xf12345678, v[0:1] -; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: v_add_u64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0xf12345678, v[0:1] +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: v_add_u64: +; GFX13: ; %bb.0: +; GFX13-NEXT: v_add_co_u32 v0, vcc_lo, 0x12345678, v0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_add_co_ci_u32_e64 
v1, null, 15, v1, vcc_lo +; GFX13-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX13-NEXT: s_endpgm %result = add i64 %a, 64729929336 store i64 %result, ptr addrspace(1) %out, align 8 ret void } define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) { -; GCN-LABEL: s_add_neg_u64: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0xfffffff0edcba988 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: s_add_neg_u64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0xfffffff0edcba988 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_add_neg_u64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0xfffffff0edcba988 +; GFX13-NEXT: ; return to shader part epilog %result = sub i64 %a, 64729929336 ret i64 %result } define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) { -; GCN-LABEL: v_add_neg_u64: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_add_nc_u64_e32 v[0:1], 0xfffffff0edcba988, v[0:1] -; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: v_add_neg_u64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0xfffffff0edcba988, v[0:1] +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: v_add_neg_u64: +; GFX13: ; %bb.0: +; GFX13-NEXT: v_add_co_u32 v0, vcc_lo, 0xedcba988, v0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_add_co_ci_u32_e64 v1, null, -16, v1, vcc_lo +; GFX13-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX13-NEXT: s_endpgm %result = sub i64 %a, 64729929336 store i64 %result, ptr 
addrspace(1) %out, align 8 ret void } define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) { -; GCN-LABEL: s_sub_u64: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: s_sub_nc_u64 s[0:1], 0xf12345678, s[0:1] -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: s_sub_u64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_sub_nc_u64 s[0:1], 0xf12345678, s[0:1] +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_sub_u64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_sub_nc_u64 s[0:1], 0xf12345678, s[0:1] +; GFX13-NEXT: ; return to shader part epilog %result = sub i64 64729929336, %a ret i64 %result } define amdgpu_ps void @v_sub_u64(i64 %a, ptr addrspace(1) %out) { -; GCN-LABEL: v_sub_u64: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_sub_nc_u64_e32 v[0:1], 0xf12345678, v[0:1] -; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: v_sub_u64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], 0xf12345678, v[0:1] +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: v_sub_u64: +; GFX13: ; %bb.0: +; GFX13-NEXT: v_sub_co_u32 v0, vcc_lo, 0x12345678, v0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_sub_co_ci_u32_e64 v1, null, 15, v1, vcc_lo +; GFX13-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX13-NEXT: s_endpgm %result = sub i64 64729929336, %a store i64 %result, ptr addrspace(1) %out, align 8 ret void } define void @v_mov_b64_double(ptr addrspace(1) %ptr) { -; GCN-LABEL: v_mov_b64_double: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: 
v_mov_b64_e32 v[2:3], 0x4063233333333333 -; GCN-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off scope:SCOPE_SYS -; GCN-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-LABEL: v_mov_b64_double: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 0x4063233333333333 +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off scope:SCOPE_SYS +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mov_b64_double: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: global_load_b64 v[4:5], v[0:1], off +; GFX13-NEXT: s_mov_b32 s0, 0 +; GFX13-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX13-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_add_f64_e32 v[2:3], 0x4063233333333333, v[4:5] +; GFX13-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX13-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX13-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX13-NEXT: s_cbranch_execnz .LBB6_1 +; GFX13-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX13-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, double 153.1 monotonic ret void } define void @v_mov_b64_int(ptr addrspace(1) %ptr) { -; GCN-LABEL: v_mov_b64_int: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_mov_b64_e32 v[2:3], 0xf12345678 -; GCN-NEXT: global_atomic_add_u64 v[0:1], v[2:3], off scope:SCOPE_SYS -; GCN-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-LABEL: v_mov_b64_int: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 0xf12345678 +; GFX1250-NEXT: global_atomic_add_u64 v[0:1], v[2:3], off scope:SCOPE_SYS +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mov_b64_int: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v2, 0x12345678 :: v_dual_mov_b32 v3, 15 +; GFX13-NEXT: global_atomic_add_u64 v[0:1], v[2:3], off scope:SCOPE_SYS +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw add ptr addrspace(1) %ptr, i64 64729929336 monotonic ret void } define void @store_double(ptr addrspace(1) %ptr) { -; GCN-LABEL: store_double: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_mov_b64_e32 v[2:3], 0x4063233333333333 -; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off -; GCN-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-LABEL: store_double: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 0x4063233333333333 +; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: store_double: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mov_b32_e32 v2, 0x33333333 +; GFX13-NEXT: v_mov_b32_e32 v3, 0x40632333 +; GFX13-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-NEXT: s_set_pc_i64 s[30:31] store double 153.1, ptr addrspace(1) %ptr ret void } define i1 @class_f64() noinline optnone { -; GCN-SDAG-LABEL: class_f64: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 -; GCN-SDAG-NEXT: s_mov_b32 s2, 1 -; GCN-SDAG-NEXT: s_mov_b64 s[0:1], 
0x4063233333333333 -; GCN-SDAG-NEXT: v_cmp_class_f64_e64 s0, s[0:1], s2 -; GCN-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-LABEL: class_f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, 1 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0x4063233333333333 +; GFX1250-SDAG-NEXT: v_cmp_class_f64_e64 s0, s[0:1], s2 +; GFX1250-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; -; GCN-GISEL-LABEL: class_f64: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 -; GCN-GISEL-NEXT: s_mov_b32 s2, 1 -; GCN-GISEL-NEXT: s_mov_b64 s[0:1], 0x4063233333333333 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GCN-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GCN-GISEL-NEXT: v_cmp_class_f64_e64 s0, v[0:1], v2 -; GCN-GISEL-NEXT: s_cmp_lg_u32 s0, 0 -; GCN-GISEL-NEXT: s_cselect_b32 s0, 1, 0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-GISEL-LABEL: class_f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0x4063233333333333 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX1250-GISEL-NEXT: v_cmp_class_f64_e64 s0, v[0:1], v2 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: class_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_mov_b32 s2, 1 +; GFX13-SDAG-NEXT: s_mov_b64 s[0:1], 
0x4063233333333333 +; GFX13-SDAG-NEXT: v_cmp_class_f64_e64 s0, s[0:1], s2 +; GFX13-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: class_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX13-GISEL-NEXT: s_mov_b64 s[0:1], 0x4063233333333333 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX13-GISEL-NEXT: v_cmp_class_f64_e64 s0, v[0:1], v2 +; GFX13-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX13-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %result = call i1 @llvm.amdgcn.class.f64(double 153.1, i32 1) nounwind readnone ret i1 %result } define double @rsq_f64() { -; GCN-LABEL: rsq_f64: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_rsq_f64_e32 v[0:1], 0x4063233333333333 -; GCN-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-LABEL: rsq_f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_rsq_f64_e32 v[0:1], 0x4063233333333333 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: rsq_f64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_rsq_f64_e32 v[0:1], 0x4063233333333333 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %result = call double @llvm.amdgcn.rsq.f64(double 153.1) nounwind readnone ret double %result } define amdgpu_ps i64 @s_and_b64(i64 inreg %a) { -; GCN-LABEL: s_and_b64: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 
src0=0 src1=0 src2=0 -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], 0xf12345678 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: s_and_b64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_and_b64 s[0:1], s[0:1], 0xf12345678 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_and_b64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_and_b64 s[0:1], s[0:1], 0xf12345678 +; GFX13-NEXT: ; return to shader part epilog %result = and i64 %a, 64729929336 ret i64 %result } @@ -156,33 +291,52 @@ define amdgpu_ps i64 @s_and_b64(i64 inreg %a) { ; No V_AND_B64 instruction, it has to be split define amdgpu_ps void @v_and_b64(i64 %a, ptr addrspace(1) %out) { -; GCN-SDAG-LABEL: v_and_b64: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-SDAG-NEXT: v_and_b32_e32 v1, 15, v1 -; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0x12345678, v0 -; GCN-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off -; GCN-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-LABEL: v_and_b64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x12345678, v0 +; GFX1250-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: v_and_b64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x12345678, v0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX1250-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_and_b64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x12345678, v0 +; 
GFX13-SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX13-SDAG-NEXT: s_endpgm ; -; GCN-GISEL-LABEL: v_and_b64: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-GISEL-NEXT: v_and_b32_e32 v0, 0x12345678, v0 -; GCN-GISEL-NEXT: v_and_b32_e32 v1, 15, v1 -; GCN-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off -; GCN-GISEL-NEXT: s_endpgm +; GFX13-GISEL-LABEL: v_and_b64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x12345678, v0 +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX13-GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX13-GISEL-NEXT: s_endpgm %result = and i64 %a, 64729929336 store i64 %result, ptr addrspace(1) %out, align 8 ret void } define amdgpu_ps <2 x float> @v_add_f64_200.1(double %a) { -; GCN-LABEL: v_add_f64_200.1: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_add_f64_e32 v[0:1], 0x4069033333333333, v[0:1] -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: v_add_f64_200.1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_add_f64_e32 v[0:1], 0x4069033333333333, v[0:1] +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: v_add_f64_200.1: +; GFX13: ; %bb.0: +; GFX13-NEXT: v_add_f64_e32 v[0:1], 0x4069033333333333, v[0:1] +; GFX13-NEXT: ; return to shader part epilog %add = fadd double %a, 200.1 %ret = bitcast double %add to <2 x float> ret <2 x float> %ret @@ -191,12 +345,17 @@ define amdgpu_ps <2 x float> @v_add_f64_200.1(double %a) { ; 200.0 can be encoded as 32-bit literal define amdgpu_ps <2 x float> @v_add_f64_200.0(double %a) { 
-; GCN-LABEL: v_add_f64_200.0: -; GCN: ; %bb.0: -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_add_f64_e32 v[0:1], 0x40690000, v[0:1] -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: v_add_f64_200.0: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_add_f64_e32 v[0:1], 0x40690000, v[0:1] +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: v_add_f64_200.0: +; GFX13: ; %bb.0: +; GFX13-NEXT: v_add_f64_e32 v[0:1], 0x40690000, v[0:1] +; GFX13-NEXT: ; return to shader part epilog %add = fadd double %a, 200.0 %ret = bitcast double %add to <2 x float> ret <2 x float> %ret @@ -205,21 +364,29 @@ define amdgpu_ps <2 x float> @v_add_f64_200.0(double %a) { ; No folding into VOP3 define amdgpu_ps <2 x float> @v_lshl_add_u64(i64 %a) { -; GCN-SDAG-LABEL: v_lshl_add_u64: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-SDAG-NEXT: s_mov_b64 s[0:1], 0xf12345678 -; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, s[0:1] -; GCN-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-LABEL: v_lshl_add_u64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xf12345678 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, s[0:1] +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: v_lshl_add_u64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0xf12345678 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3] +; GFX1250-GISEL-NEXT: ; return to shader part epilog ; -; GCN-GISEL-LABEL: v_lshl_add_u64: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0xf12345678 -; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3] -; GCN-GISEL-NEXT: ; return to shader part epilog +; GFX13-LABEL: v_lshl_add_u64: +; GFX13: ; %bb.0: +; GFX13-NEXT: v_lshlrev_b64_e32 v[0:1], 1, v[0:1] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_u32 v0, vcc_lo, 0x12345678, v0 +; GFX13-NEXT: v_add_co_ci_u32_e64 v1, null, 15, v1, vcc_lo +; GFX13-NEXT: ; return to shader part epilog %shl = shl i64 %a, 1 %add = add i64 %shl, 64729929336 %ret = bitcast i64 %add to <2 x float> @@ -229,30 +396,56 @@ define amdgpu_ps <2 x float> @v_lshl_add_u64(i64 %a) { ; No folding into VOP2 promoted to VOP3 define amdgpu_ps <2 x float> @v_fma_f64(double %a, double %b) { -; GCN-SDAG-LABEL: v_fma_f64: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-SDAG-NEXT: v_fmaak_f64 v[4:5], v[0:1], v[2:3], 0x4063233333333333 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x4069033333333333 -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-SDAG-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], 0x4069033333333333 -; GCN-SDAG-NEXT: v_fmac_f64_e32 v[2:3], v[0:1], v[4:5] -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GCN-SDAG-NEXT: ; return to shader part epilog +; 
GFX1250-SDAG-LABEL: v_fma_f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-NEXT: v_fmaak_f64 v[4:5], v[0:1], v[2:3], 0x4063233333333333 +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x4069033333333333 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], 0x4069033333333333 +; GFX1250-SDAG-NEXT: v_fmac_f64_e32 v[2:3], v[0:1], v[4:5] +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-SDAG-NEXT: ; return to shader part epilog ; -; GCN-GISEL-LABEL: v_fma_f64: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], 0x4063233333333333 -; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-GISEL-NEXT: v_fmac_f64_e32 v[4:5], v[0:1], v[2:3] -; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0x4069033333333333 -; GCN-GISEL-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], 0x4069033333333333 -; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-GISEL-NEXT: v_fmac_f64_e32 v[2:3], v[0:1], v[4:5] -; GCN-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 -; GCN-GISEL-NEXT: ; return to shader part epilog +; GFX1250-GISEL-LABEL: v_fma_f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], 0x4063233333333333 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_fmac_f64_e32 v[4:5], v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0x4069033333333333 +; GFX1250-GISEL-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], 
0x4069033333333333 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_fmac_f64_e32 v[2:3], v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-GISEL-NEXT: ; return to shader part epilog +; +; GFX13-SDAG-LABEL: v_fma_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_fmaak_f64 v[4:5], v[0:1], v[2:3], 0x4063233333333333 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0x33333333 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v3, 0x40690333 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], 0x4069033333333333 +; GFX13-SDAG-NEXT: v_fmac_f64_e32 v[2:3], v[0:1], v[4:5] +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX13-SDAG-NEXT: ; return to shader part epilog +; +; GFX13-GISEL-LABEL: v_fma_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_mov_b32_e32 v4, 0x33333333 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v5, 0x40632333 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX13-GISEL-NEXT: v_fmac_f64_e32 v[4:5], v[0:1], v[2:3] +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0x33333333 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v3, 0x40690333 +; GFX13-GISEL-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], 0x4069033333333333 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_fmac_f64_e32 v[2:3], v[0:1], v[4:5] +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX13-GISEL-NEXT: ; return to shader part epilog %r1 = call double @llvm.fma.f64(double %a, double %b, double 153.1) nounwind readnone %r2 = call double @llvm.fma.f64(double %a, double %r1, double 200.1) nounwind readnone %r3 = call double @llvm.fma.f64(double %r2, double %r1, double 200.1) nounwind readnone @@ -261,23 +454,39 @@ define amdgpu_ps <2 x float> 
@v_fma_f64(double %a, double %b) { } define amdgpu_ps <2 x float> @v_add_neg_f64(double %a) { -; GCN-SDAG-LABEL: v_add_neg_f64: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-SDAG-NEXT: s_mov_b64 s[0:1], 0x4069033333333333 -; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-SDAG-NEXT: v_add_f64_e64 v[0:1], -v[0:1], s[0:1] -; GCN-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-LABEL: v_add_neg_f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0x4069033333333333 +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-NEXT: v_add_f64_e64 v[0:1], -v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: v_add_neg_f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0x4069033333333333 +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_f64_e64 v[0:1], -v[0:1], v[2:3] +; GFX1250-GISEL-NEXT: ; return to shader part epilog ; -; GCN-GISEL-LABEL: v_add_neg_f64: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0x4069033333333333 -; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GCN-GISEL-NEXT: v_add_f64_e64 v[0:1], -v[0:1], v[2:3] -; GCN-GISEL-NEXT: ; return to shader part epilog +; GFX13-SDAG-LABEL: v_add_neg_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_mov_b64 s[0:1], 0x4069033333333333 +; GFX13-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-SDAG-NEXT: v_add_f64_e64 v[0:1], -v[0:1], s[0:1] +; GFX13-SDAG-NEXT: ; return to shader part epilog +; +; GFX13-GISEL-LABEL: v_add_neg_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0x33333333 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v3, 0x40690333 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_add_f64_e64 v[0:1], -v[0:1], v[2:3] +; GFX13-GISEL-NEXT: ; return to shader part epilog %fneg = fsub double -0.0, %a %add = fadd double %fneg, 200.1 %ret = bitcast double %add to <2 x float> @@ -285,25 +494,43 @@ define amdgpu_ps <2 x float> @v_add_neg_f64(double %a) { } define amdgpu_ps <2 x float> @v_cndmask(double %a) { -; GCN-SDAG-LABEL: v_cndmask: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-SDAG-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1] -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, 0x40632000 -; GCN-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x40690333, v1, vcc_lo -; GCN-SDAG-NEXT: ; return to shader part epilog +; GFX1250-SDAG-LABEL: v_cndmask: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-SDAG-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0x40632000 +; GFX1250-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x40690333, v1, vcc_lo +; 
GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: v_cndmask: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-GISEL-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0x40690333 +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0x40632000, vcc_lo +; GFX1250-GISEL-NEXT: ; return to shader part epilog +; +; GFX13-SDAG-LABEL: v_cndmask: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1] +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, 0x40632000 +; GFX13-SDAG-NEXT: v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x40690333, v1, vcc_lo +; GFX13-SDAG-NEXT: ; return to shader part epilog ; -; GCN-GISEL-LABEL: v_cndmask: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-GISEL-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1] -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0x40690333 -; GCN-GISEL-NEXT: v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo -; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GCN-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0x40632000, vcc_lo -; GCN-GISEL-NEXT: ; return to shader part epilog +; GFX13-GISEL-LABEL: v_cndmask: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1] +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0x40690333 +; GFX13-GISEL-NEXT: v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0x40632000, vcc_lo +; GFX13-GISEL-NEXT: ; return to shader part epilog %cmp = fcmp oeq double %a, 0.0 %sel = select i1 %cmp, double 153.0, double 200.1 %ret = bitcast double %sel to <2 x float> diff 
--git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index d26efd3dc2aa7..d10e4511a5c6a 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -6,6 +6,7 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX11 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1250 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1310 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX13 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG %s ; mul24 and mad24 are affected @@ -145,6 +146,25 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1 ; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: test_mul_v2i32: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_mov_b32 s6, -1 +; GFX13-NEXT: s_mov_b32 s7, 0x31016000 +; GFX13-NEXT: s_mov_b32 s10, s6 +; GFX13-NEXT: s_mov_b32 s11, s7 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s8, s2 +; GFX13-NEXT: s_mov_b32 s9, s3 +; GFX13-NEXT: s_mov_b32 s4, s0 +; GFX13-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null +; GFX13-NEXT: s_mov_b32 s5, s1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX13-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: test_mul_v2i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -331,6 +351,29 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX1250-NEXT: 
buffer_store_b128 v[0:3], off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: v_mul_v4i32: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_mov_b32 s6, -1 +; GFX13-NEXT: s_mov_b32 s7, 0x31016000 +; GFX13-NEXT: s_mov_b32 s10, s6 +; GFX13-NEXT: s_mov_b32 s11, s7 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s8, s2 +; GFX13-NEXT: s_mov_b32 s9, s3 +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null +; GFX13-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16 +; GFX13-NEXT: s_mov_b32 s4, s0 +; GFX13-NEXT: s_mov_b32 s5, s1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_mul_lo_u32 v3, v3, v7 +; GFX13-NEXT: v_mul_lo_u32 v2, v2, v6 +; GFX13-NEXT: v_mul_lo_u32 v1, v1, v5 +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v4 +; GFX13-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: v_mul_v4i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] @@ -462,6 +505,19 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: s_trunc_i64_mul_to_i32: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_load_b32 s3, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mul_i32 s2, s3, s2 +; GFX13-NEXT: s_mov_b32 s3, 0x31016000 +; GFX13-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-NEXT: s_mov_b32 s2, -1 +; GFX13-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: s_trunc_i64_mul_to_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -639,6 +695,29 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: v_trunc_i64_mul_to_i32: +; GFX13: ; %bb.0: ; %entry +; 
GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-NEXT: s_mov_b32 s10, -1 +; GFX13-NEXT: s_mov_b32 s11, 0x31016000 +; GFX13-NEXT: s_mov_b32 s14, s10 +; GFX13-NEXT: s_mov_b32 s15, s11 +; GFX13-NEXT: s_mov_b32 s6, s10 +; GFX13-NEXT: s_mov_b32 s7, s11 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s12, s2 +; GFX13-NEXT: s_mov_b32 s13, s3 +; GFX13-NEXT: buffer_load_b32 v0, off, s[12:15], null +; GFX13-NEXT: buffer_load_b32 v1, off, s[4:7], null +; GFX13-NEXT: s_mov_b32 s8, s0 +; GFX13-NEXT: s_mov_b32 s9, s1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX13-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: v_trunc_i64_mul_to_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] @@ -768,6 +847,19 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: mul64_sext_c: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_ashr_i32 s3, s2, 31 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 +; GFX13-NEXT: s_mov_b32 s3, 0x31016000 +; GFX13-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX13-NEXT: s_mov_b32 s2, -1 +; GFX13-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: mul64_sext_c: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -884,6 +976,18 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: mul64_zext_c: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_mov_b32 s3, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: 
s_mul_u64 s[4:5], s[2:3], 0x50 +; GFX13-NEXT: s_mov_b32 s3, 0x31016000 +; GFX13-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX13-NEXT: s_mov_b32 s2, -1 +; GFX13-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: mul64_zext_c: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] @@ -1041,6 +1145,25 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1 ; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: v_mul64_sext_c: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_mov_b32 s6, -1 +; GFX13-NEXT: s_mov_b32 s7, 0x31016000 +; GFX13-NEXT: s_mov_b32 s10, s6 +; GFX13-NEXT: s_mov_b32 s11, s7 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s8, s2 +; GFX13-NEXT: s_mov_b32 s9, s3 +; GFX13-NEXT: s_mov_b32 s4, s0 +; GFX13-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX13-NEXT: s_mov_b32 s5, s1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_mul_hi_i32 v1, 0x50, v0 +; GFX13-NEXT: v_mul_lo_u32 v0, 0x50, v0 +; GFX13-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: v_mul64_sext_c: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1204,6 +1327,25 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1 ; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: v_mul64_zext_c: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_mov_b32 s6, -1 +; GFX13-NEXT: s_mov_b32 s7, 0x31016000 +; GFX13-NEXT: s_mov_b32 s10, s6 +; GFX13-NEXT: s_mov_b32 s11, s7 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s8, s2 +; GFX13-NEXT: s_mov_b32 s9, s3 +; GFX13-NEXT: s_mov_b32 s4, s0 +; GFX13-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX13-NEXT: s_mov_b32 s5, s1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: 
v_mul_hi_u32 v1, 0x50, v0 +; GFX13-NEXT: v_mul_lo_u32 v0, 0x50, v0 +; GFX13-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: v_mul64_zext_c: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1365,6 +1507,25 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad ; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: v_mul64_sext_inline_imm: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_mov_b32 s6, -1 +; GFX13-NEXT: s_mov_b32 s7, 0x31016000 +; GFX13-NEXT: s_mov_b32 s10, s6 +; GFX13-NEXT: s_mov_b32 s11, s7 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s8, s2 +; GFX13-NEXT: s_mov_b32 s9, s3 +; GFX13-NEXT: s_mov_b32 s4, s0 +; GFX13-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX13-NEXT: s_mov_b32 s5, s1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_mul_hi_i32 v1, 9, v0 +; GFX13-NEXT: v_mul_lo_u32 v0, 9, v0 +; GFX13-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: v_mul64_sext_inline_imm: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1488,6 +1649,20 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [ ; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: s_mul_i32: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_clause 0x2 +; GFX13-NEXT: s_load_b32 s2, s[4:5], 0x4c nv +; GFX13-NEXT: s_load_b32 s3, s[4:5], 0x70 nv +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mul_i32 s2, s2, s3 +; GFX13-NEXT: s_mov_b32 s3, 0x31016000 +; GFX13-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-NEXT: s_mov_b32 s2, -1 +; GFX13-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: s_mul_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] @@ -1632,6 +1807,24 @@ define 
amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: v_mul_i32: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_mov_b32 s6, -1 +; GFX13-NEXT: s_mov_b32 s7, 0x31016000 +; GFX13-NEXT: s_mov_b32 s10, s6 +; GFX13-NEXT: s_mov_b32 s11, s7 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s8, s2 +; GFX13-NEXT: s_mov_b32 s9, s3 +; GFX13-NEXT: s_mov_b32 s4, s0 +; GFX13-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null +; GFX13-NEXT: s_mov_b32 s5, s1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX13-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: v_mul_i32: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] @@ -1764,6 +1957,22 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8 ; GFX1250-NEXT: buffer_store_b8 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: s_mul_i1: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_clause 0x2 +; GFX13-NEXT: s_load_b32 s2, s[4:5], 0x4c nv +; GFX13-NEXT: s_load_b32 s3, s[4:5], 0x70 nv +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_and_b32 s2, s2, s3 +; GFX13-NEXT: s_mov_b32 s3, 0x31016000 +; GFX13-NEXT: s_and_b32 s2, s2, 1 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-NEXT: s_mov_b32 s2, -1 +; GFX13-NEXT: buffer_store_b8 v0, off, s[0:3], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: s_mul_i1: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @10, KC0[], KC1[] @@ -1946,6 +2155,28 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX1250-NEXT: buffer_store_b8 v0, off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: v_mul_i1: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: 
s_mov_b32 s6, -1 +; GFX13-NEXT: s_mov_b32 s7, 0x31016000 +; GFX13-NEXT: s_mov_b32 s10, s6 +; GFX13-NEXT: s_mov_b32 s11, s7 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s8, s2 +; GFX13-NEXT: s_mov_b32 s9, s3 +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: buffer_load_u8 v0, off, s[8:11], null +; GFX13-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4 +; GFX13-NEXT: s_mov_b32 s4, s0 +; GFX13-NEXT: s_mov_b32 s5, s1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX13-NEXT: buffer_store_b8 v0, off, s[4:7], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: v_mul_i1: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] @@ -2117,6 +2348,19 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: s_mul_i64: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5] +; GFX13-NEXT: s_mov_b32 s3, 0x31016000 +; GFX13-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX13-NEXT: s_mov_b32 s2, -1 +; GFX13-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: s_mul_i64: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] @@ -2329,6 +2573,35 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: v_mul_i64: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-NEXT: s_mov_b32 s10, -1 +; GFX13-NEXT: s_mov_b32 s11, 0x31016000 +; GFX13-NEXT: s_mov_b32 s6, s10 
+; GFX13-NEXT: s_mov_b32 s7, s11 +; GFX13-NEXT: s_mov_b32 s14, s10 +; GFX13-NEXT: s_mov_b32 s15, s11 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s12, s2 +; GFX13-NEXT: s_mov_b32 s13, s3 +; GFX13-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null +; GFX13-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null +; GFX13-NEXT: s_mov_b32 s8, s0 +; GFX13-NEXT: s_mov_b32 s9, s1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX13-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX13-NEXT: v_mul_hi_u32 v4, v0, v2 +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX13-NEXT: v_add_nc_u32_e32 v1, v1, v4 +; GFX13-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: v_mul_i64: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] @@ -2607,6 +2880,41 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: mul32_in_branch: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 nv +; GFX13-NEXT: s_mov_b32 s6, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_cmp_lg_u32 s0, 0 +; GFX13-NEXT: s_cbranch_scc0 .LBB15_2 +; GFX13-NEXT: ; %bb.1: ; %else +; GFX13-NEXT: s_mul_i32 s7, s0, s1 +; GFX13-NEXT: s_branch .LBB15_3 +; GFX13-NEXT: .LBB15_2: +; GFX13-NEXT: s_mov_b32 s6, -1 +; GFX13-NEXT: ; implicit-def: $sgpr7 +; GFX13-NEXT: .LBB15_3: ; %Flow +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 +; GFX13-NEXT: s_cbranch_vccnz .LBB15_5 +; GFX13-NEXT: ; %bb.4: ; %if +; GFX13-NEXT: s_mov_b32 s7, 0x31016000 +; GFX13-NEXT: s_mov_b32 s6, -1 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s4, s2 +; GFX13-NEXT: s_mov_b32 s5, s3 +; GFX13-NEXT: buffer_load_b32 v0, off, s[4:7], null +; GFX13-NEXT: s_branch 
.LBB15_6 +; GFX13-NEXT: .LBB15_5: +; GFX13-NEXT: v_mov_b32_e32 v0, s7 +; GFX13-NEXT: .LBB15_6: ; %endif +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s3, 0x31016000 +; GFX13-NEXT: s_mov_b32 s2, -1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: buffer_store_b32 v0, off, s[0:3], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: mul32_in_branch: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU_PUSH_BEFORE 3, @14, KC0[CB0:0-32], KC1[] @@ -2889,6 +3197,34 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: mul64_in_branch: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX13-NEXT: s_cbranch_scc0 .LBB16_3 +; GFX13-NEXT: ; %bb.1: ; %else +; GFX13-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7] +; GFX13-NEXT: s_cbranch_execnz .LBB16_4 +; GFX13-NEXT: .LBB16_2: ; %if +; GFX13-NEXT: s_mov_b32 s7, 0x31016000 +; GFX13-NEXT: s_mov_b32 s6, -1 +; GFX13-NEXT: s_mov_b32 s4, s2 +; GFX13-NEXT: s_mov_b32 s5, s3 +; GFX13-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null +; GFX13-NEXT: s_branch .LBB16_5 +; GFX13-NEXT: .LBB16_3: +; GFX13-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX13-NEXT: s_branch .LBB16_2 +; GFX13-NEXT: .LBB16_4: +; GFX13-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX13-NEXT: .LBB16_5: ; %endif +; GFX13-NEXT: s_mov_b32 s3, 0x31016000 +; GFX13-NEXT: s_mov_b32 s2, -1 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: mul64_in_branch: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU_PUSH_BEFORE 4, @14, KC0[CB0:0-32], KC1[] @@ -3278,6 +3614,50 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX1250-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: s_mul_i128: +; GFX13: ; %bb.0: ; %entry +; 
GFX13-NEXT: s_clause 0x2 +; GFX13-NEXT: s_load_b128 s[8:11], s[4:5], 0x7c nv +; GFX13-NEXT: s_load_b128 s[12:15], s[4:5], 0x4c nv +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; GFX13-NEXT: s_mov_b64 s[4:5], 0xffffffff +; GFX13-NEXT: s_mov_b32 s3, 0 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_mov_b32 s7, s3 +; GFX13-NEXT: s_mov_b32 s17, s3 +; GFX13-NEXT: s_mov_b32 s19, s3 +; GFX13-NEXT: s_mov_b32 s20, s3 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s2, s8 +; GFX13-NEXT: s_and_b64 s[4:5], s[12:13], s[4:5] +; GFX13-NEXT: s_mov_b32 s6, s13 +; GFX13-NEXT: s_mul_u64 s[22:23], s[4:5], s[2:3] +; GFX13-NEXT: s_mul_u64 s[24:25], s[6:7], s[2:3] +; GFX13-NEXT: s_mov_b32 s2, s23 +; GFX13-NEXT: s_mov_b32 s16, s9 +; GFX13-NEXT: s_mul_u64 s[10:11], s[10:11], s[12:13] +; GFX13-NEXT: s_add_nc_u64 s[12:13], s[24:25], s[2:3] +; GFX13-NEXT: s_mul_u64 s[4:5], s[4:5], s[16:17] +; GFX13-NEXT: s_mov_b32 s2, s13 +; GFX13-NEXT: s_mov_b32 s13, s3 +; GFX13-NEXT: s_mul_u64 s[8:9], s[8:9], s[14:15] +; GFX13-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[12:13] +; GFX13-NEXT: s_mul_u64 s[6:7], s[6:7], s[16:17] +; GFX13-NEXT: s_mov_b32 s18, s5 +; GFX13-NEXT: s_mov_b32 s23, s3 +; GFX13-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19] +; GFX13-NEXT: s_add_nc_u64 s[8:9], s[10:11], s[8:9] +; GFX13-NEXT: s_mov_b32 s21, s4 +; GFX13-NEXT: s_add_nc_u64 s[2:3], s[6:7], s[2:3] +; GFX13-NEXT: s_or_b64 s[4:5], s[22:23], s[20:21] +; GFX13-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[8:9] +; GFX13-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX13-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX13-NEXT: s_mov_b32 s3, 0x31016000 +; GFX13-NEXT: s_mov_b32 s2, -1 +; GFX13-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: s_mul_i128: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 41, @4, KC0[CB0:0-32], KC1[] @@ -3591,6 +3971,42 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX1250-NEXT: global_store_b128 
v16, v[8:11], s[2:3] scale_offset ; GFX1250-NEXT: s_endpgm ; +; GFX13-LABEL: v_mul_i128: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c nv +; GFX13-NEXT: v_and_b32_e32 v13, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v10, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: global_load_b128 v[0:3], v13, s[0:1] scale_offset +; GFX13-NEXT: global_load_b128 v[4:7], v13, s[2:3] scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0 +; GFX13-NEXT: v_mul_lo_u32 v14, v4, v3 +; GFX13-NEXT: v_mul_lo_u32 v7, v7, v0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10] +; GFX13-NEXT: v_mov_b32_e32 v9, v11 +; GFX13-NEXT: v_mul_lo_u32 v11, v5, v2 +; GFX13-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v2, 0 +; GFX13-NEXT: v_mul_lo_u32 v4, v6, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-NEXT: v_mad_co_u64_u32 v[9:10], null, v0, v5, v[9:10] +; GFX13-NEXT: v_add3_u32 v3, v3, v14, v11 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_u32 v10, s0, v12, v10 +; GFX13-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3] +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_add3_u32 v3, v7, v3, v4 +; GFX13-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_add_co_ci_u32_e64 v11, null, v1, v3, vcc_lo +; GFX13-NEXT: global_store_b128 v13, v[8:11], s[2:3] scale_offset +; GFX13-NEXT: s_endpgm +; ; EG-LABEL: v_mul_i128: ; EG: ; %bb.0: ; %entry ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] @@ -3710,6 
+4126,16 @@ define i32 @mul_pow2_plus_1(i32 %val) { ; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 3, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; +; GFX13-LABEL: mul_pow2_plus_1: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_lshl_add_u32 v0, v0, 3, v0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] +; ; EG-LABEL: mul_pow2_plus_1: ; EG: ; %bb.0: ; EG-NEXT: CF_END diff --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll index 9029e5f724e27..74bd338c7c022 100644 --- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll +++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll @@ -7,6 +7,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1310 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX13,GFX13-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1310 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX13,GFX13-REAL16 %s ; We want to undo these canonicalizations to enable mad matching: ; (x * y) + x --> x * (y + 1) @@ -45,6 +47,16 @@ define i32 @v_mul_add_1_i32(i32 %x, i32 %y) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[0:1] +; GFX13-NEXT: 
s_set_pc_i64 s[30:31] %add = add i32 %y, 1 %mul = mul i32 %x, %add ret i32 %mul @@ -83,6 +95,16 @@ define i32 @v_mul_add_1_i32_commute(i32 %x, i32 %y) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_i32_commute: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[0:1] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i32 %y, 1 %mul = mul i32 %add, %x ret i32 %mul @@ -121,6 +143,16 @@ define i32 @v_mul_add_x_i32(i32 %x, i32 %y) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_x_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[0:1] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i32 %x, %y %add = add i32 %x, %mul ret i32 %add @@ -163,6 +195,18 @@ define i32 @v_mul_sub_1_i32(i32 %x, i32 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub i32 %y, 1 %mul = mul i32 %x, %sub ret i32 %mul @@ -205,6 +249,18 @@ define i32 @v_mul_sub_1_i32_commute(i32 %x, i32 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX1250-NEXT: v_mul_lo_u32 v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_i32_commute: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub i32 %y, 1 %mul = mul i32 %sub, %x ret i32 %mul @@ -247,6 +303,18 @@ define i32 @v_mul_sub_x_i32(i32 %x, i32 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_x_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i32 %x, %y %sub = sub i32 %mul, %x ret i32 %sub @@ -289,6 +357,18 @@ define i32 @v_mul_add_2_i32(i32 %x, i32 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_2_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, 2, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i32 %y, 2 %mul = mul i32 %x, %add ret i32 %mul @@ -331,6 +411,18 @@ define i32 @v_mul_sub_2_i32(i32 %x, i32 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1 ; 
GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_2_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, -2, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub i32 %y, 2 %mul = mul i32 %x, %sub ret i32 %mul @@ -373,6 +465,18 @@ define i32 @v_mul_add_65_i32(i32 %x, i32 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_65_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, 0x41, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i32 %y, 65 %mul = mul i32 %x, %add ret i32 %mul @@ -415,6 +519,18 @@ define i32 @v_mul_sub_65_i32(i32 %x, i32 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_65_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub i32 %y, 65 %mul = mul i32 %x, %sub ret i32 %mul @@ -457,6 +573,18 @@ define i24 @v_mul_add_1_i24_zext(i24 zeroext %x, i24 zeroext %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX1250-NEXT: 
s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_i24_zext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i24 %y, 1 %mul = mul i24 %x, %add ret i24 %mul @@ -499,6 +627,18 @@ define i24 @v_mul_sub_1_i24_zext(i24 zeroext %x, i24 zeroext %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_i24_zext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub i24 %y, 1 %mul = mul i24 %x, %sub ret i24 %mul @@ -535,6 +675,16 @@ define i24 @v_add_mul_i24_zext_1(i24 zeroext %x, i24 zeroext %y) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mad_u32_u24 v0, v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_add_mul_i24_zext_1: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_u32_u24 v0, v0, v1, v0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i24 %x, %y %add = add i24 %mul, %x ret i24 %add @@ -577,6 +727,18 @@ define i24 @v_mul_add_1_i24_sext(i24 signext %x, i24 signext %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: 
v_mul_add_1_i24_sext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i24 %y, 1 %mul = mul i24 %x, %add ret i24 %mul @@ -613,6 +775,16 @@ define i24 @v_add_mul_i24_sext_1(i24 signext %x, i24 signext %y) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mad_u32_u24 v0, v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_add_mul_i24_sext_1: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_u32_u24 v0, v0, v1, v0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i24 %x, %y %add = add i24 %mul, %x ret i24 %add @@ -655,6 +827,18 @@ define i24 @v_mul_sub_1_i24_sext(i24 signext %x, i24 signext %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_i24_sext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, -1, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub i24 %y, 1 %mul = mul i24 %x, %sub ret i24 %mul @@ -693,6 +877,16 @@ define i25 @v_mul_add_1_i25_zext(i25 zeroext %x, i25 zeroext %y) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_i25_zext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[0:1] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i25 %y, 1 %mul = mul i25 %x, %add ret i25 %mul @@ -735,6 +929,18 @@ define i25 @v_mul_sub_1_i25_zext(i25 zeroext %x, i25 zeroext %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_i25_zext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, 0x1ffffff, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub i25 %y, 1 %mul = mul i25 %x, %sub ret i25 %mul @@ -773,6 +979,16 @@ define i25 @v_mul_add_1_i25_sext(i25 signext %x, i25 signext %y) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_i25_sext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[0:1] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i25 %y, 1 %mul = mul i25 %x, %add ret i25 %mul @@ -815,6 +1031,18 @@ define i25 @v_mul_sub_1_i25_sext(i25 signext %x, i25 signext %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_i25_sext: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: 
s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, 0x1ffffff, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub i25 %y, 1 %mul = mul i25 %x, %sub ret i25 %mul @@ -861,6 +1089,26 @@ define i16 @v_mul_add_1_i16(i16 %x, i16 %y) { ; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_add_1_i16: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_add_1_i16: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i16 %y, 1 %mul = mul i16 %x, %add ret i16 %mul @@ -912,6 +1160,29 @@ define i32 @v_mul_add_1_i16_zext_result(i16 %x, i16 %y) { ; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l ; GFX1250-REAL16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_add_1_i16_zext_result: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX13-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: 
v_mul_add_1_i16_zext_result: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX13-REAL16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i16 %y, 1 %mul = mul i16 %x, %add %zext = zext i16 %mul to i32 @@ -959,6 +1230,26 @@ define i16 @v_mul_add_1_i16_commute(i16 %x, i16 %y) { ; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_add_1_i16_commute: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_add_1_i16_commute: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i16 %y, 1 %mul = mul i16 %add, %x ret i16 %mul @@ -1004,6 +1295,26 @@ define i16 @v_mul_add_x_i16(i16 %x, i16 %y) { ; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_add_x_i16: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: 
s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_add_x_i16: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %mul = mul i16 %x, %y %add = add i16 %x, %mul ret i16 %add @@ -1057,6 +1368,30 @@ define i16 @v_mul_sub_1_i16(i16 %x, i16 %y) { ; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_sub_1_i16: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_add_nc_u16 v1, v1, -1 +; GFX13-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_sub_1_i16: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_add_nc_u16 v1.l, v1.l, -1 +; GFX13-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %sub = sub i16 %y, 1 %mul = mul i16 %x, %sub ret i16 %mul @@ -1110,6 +1445,30 @@ define i16 @v_mul_sub_1_i16_commute(i16 %x, i16 %y) { ; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-REAL16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] 
+; +; GFX13-FAKE16-LABEL: v_mul_sub_1_i16_commute: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_add_nc_u16 v1, v1, -1 +; GFX13-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-FAKE16-NEXT: v_mul_lo_u16 v0, v1, v0 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_sub_1_i16_commute: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_add_nc_u16 v1.l, v1.l, -1 +; GFX13-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-REAL16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %sub = sub i16 %y, 1 %mul = mul i16 %sub, %x ret i16 %mul @@ -1163,6 +1522,30 @@ define i16 @v_mul_sub_x_i16(i16 %x, i16 %y) { ; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-REAL16-NEXT: v_sub_nc_u16 v0.l, v0.h, v0.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_sub_x_i16: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mul_lo_u16 v1, v0, v1 +; GFX13-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-FAKE16-NEXT: v_sub_nc_u16 v0, v1, v0 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_sub_x_i16: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; 
GFX13-REAL16-NEXT: v_mul_lo_u16 v0.h, v0.l, v1.l +; GFX13-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-REAL16-NEXT: v_sub_nc_u16 v0.l, v0.h, v0.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %mul = mul i16 %x, %y %sub = sub i16 %mul, %x ret i16 %sub @@ -1216,6 +1599,30 @@ define i16 @v_mul_add_2_i16(i16 %x, i16 %y) { ; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_add_2_i16: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_add_nc_u16 v1, v1, 2 +; GFX13-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_add_2_i16: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_add_nc_u16 v1.l, v1.l, 2 +; GFX13-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i16 %y, 2 %mul = mul i16 %x, %add ret i16 %mul @@ -1269,6 +1676,30 @@ define i16 @v_mul_sub_2_i16(i16 %x, i16 %y) { ; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_sub_2_i16: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; 
GFX13-FAKE16-NEXT: v_add_nc_u16 v1, v1, -2 +; GFX13-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_sub_2_i16: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_add_nc_u16 v1.l, v1.l, -2 +; GFX13-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-REAL16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %sub = sub i16 %y, 2 %mul = mul i16 %x, %sub ret i16 %mul @@ -1341,6 +1772,21 @@ define i64 @v_mul_add_1_i64(i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-NEXT: v_mov_b32_e32 v0, v4 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[4:5], null, v0, v2, v[0:1] +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v3 +; GFX13-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX13-NEXT: v_add3_u32 v1, v1, v5, v0 +; GFX13-NEXT: v_mov_b32_e32 v0, v4 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i64 %y, 1 %mul = mul i64 %x, %add ret i64 %mul @@ -1413,6 +1859,21 @@ define i64 @v_mul_add_1_i64_commute(i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-NEXT: v_mov_b32_e32 v0, v4 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_i64_commute: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: 
v_mad_co_u64_u32 v[4:5], null, v0, v2, v[0:1] +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v3 +; GFX13-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX13-NEXT: v_add3_u32 v1, v1, v5, v0 +; GFX13-NEXT: v_mov_b32_e32 v0, v4 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i64 %y, 1 %mul = mul i64 %add, %x ret i64 %mul @@ -1485,6 +1946,21 @@ define i64 @v_mul_add_x_i64(i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-NEXT: v_mov_b32_e32 v0, v4 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_x_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[4:5], null, v0, v2, v[0:1] +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v3 +; GFX13-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX13-NEXT: v_add3_u32 v1, v1, v5, v0 +; GFX13-NEXT: v_mov_b32_e32 v0, v4 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i64 %x, %y %add = add i64 %x, %mul ret i64 %add @@ -1560,6 +2036,23 @@ define i64 @v_mul_sub_1_i64(i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_co_u32 v2, vcc_lo, v2, -1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX13-NEXT: v_mul_lo_u32 v4, v1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v3, v0, v3 +; 
GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, 0 +; GFX13-NEXT: v_add3_u32 v1, v1, v3, v4 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub i64 %y, 1 %mul = mul i64 %x, %sub ret i64 %mul @@ -1635,6 +2128,23 @@ define i64 @v_mul_sub_1_i64_commute(i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[2:3], v[0:1] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_i64_commute: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_co_u32 v2, vcc_lo, v2, -1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX13-NEXT: v_mul_lo_u32 v4, v2, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v3, v3, v0 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v0, 0 +; GFX13-NEXT: v_add3_u32 v1, v1, v4, v3 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub i64 %y, 1 %mul = mul i64 %sub, %x ret i64 %mul @@ -1708,6 +2218,23 @@ define i64 @v_mul_sub_x_i64(i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[2:3], v[0:1] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_x_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mul_lo_u32 v4, v1, v2 +; GFX13-NEXT: v_mul_lo_u32 v5, v0, v3 +; GFX13-NEXT: v_mad_co_u64_u32 v[2:3], null, v0, v2, 0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_add3_u32 v3, v3, v5, v4 +; GFX13-NEXT: v_sub_co_u32 v0, vcc_lo, v2, v0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX13-NEXT: v_sub_co_ci_u32_e64 v1, null, v3, v1, vcc_lo +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i64 %x, %y %sub = sub i64 %mul, %x ret i64 %sub @@ -1783,6 +2310,23 @@ define i64 @v_mul_add_2_i64(i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_2_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_co_u32 v2, vcc_lo, v2, 2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX13-NEXT: v_mul_lo_u32 v4, v1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, 0 +; GFX13-NEXT: v_add3_u32 v1, v1, v3, v4 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i64 %y, 2 %mul = mul i64 %x, %add ret i64 %mul @@ -1858,6 +2402,23 @@ define i64 @v_mul_sub_2_i64(i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_2_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_co_u32 v2, vcc_lo, v2, -2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX13-NEXT: v_mul_lo_u32 v4, v1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v3, v0, v3 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, 0 +; 
GFX13-NEXT: v_add3_u32 v1, v1, v3, v4 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub i64 %y, 2 %mul = mul i64 %x, %sub ret i64 %mul @@ -1914,6 +2475,19 @@ define <2 x i32> @v_mul_add_1_i32_multiple(i32 %x, i32 %y, i32 %z) { ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, v0 ; GFX1250-NEXT: v_mad_u32 v1, v2, v1, v2 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_i32_multiple: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mov_b32_e32 v3, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v3, v[0:1] +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v3, v[2:3] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i32 %y, 1 %mul0 = mul i32 %x, %add %mul1 = mul i32 %z, %add @@ -1959,6 +2533,18 @@ define <2 x i32> @v_mul_add_1_i32_other_use(i32 %x, i32 %y, i32 %z) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_i32_other_use: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v1, 1, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add i32 %y, 1 %mul0 = mul i32 %x, %add %mul1 = mul i32 %z, %add @@ -2022,6 +2608,22 @@ define i32 @v_mul_add_1_i32_chain(i32 %arg0, i32 %arg1, i32 %arg2) { ; GFX1250-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX1250-NEXT: v_mad_u32 v0, v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_i32_chain: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 
0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_add_nc_u32_e32 v2, 1, v0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v1, v2, v1 +; GFX13-NEXT: v_add_nc_u32_e32 v2, v1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v0, v2, v0 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[0:1] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %i2 = add i32 %arg0, 1 %i3 = mul i32 %i2, %arg1 %i4 = add i32 %i3, %i2 @@ -2081,6 +2683,18 @@ define <2 x i16> @v_mul_add_1_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_v2i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i16> %y, %mul = mul <2 x i16> %x, %add ret <2 x i16> %mul @@ -2137,6 +2751,18 @@ define <2 x i16> @v_mul_add_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_mul_lo_u16 v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_v2i16_commute: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i16> %y, %mul = mul <2 x i16> %add, %x ret <2 x i16> %mul @@ -2186,6 +2812,16 @@ define <2 x 
i16> @v_mul_add_x_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_mad_u16 v0, v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_x_v2i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_pk_mad_u16 v0, v0, v1, v0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul <2 x i16> %x, %y %add = add <2 x i16> %x, %mul ret <2 x i16> %add @@ -2242,6 +2878,18 @@ define <2 x i16> @v_mul_sub_1_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_v2i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_pk_sub_i16 v1, v1, 1 op_sel_hi:[1,0] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub <2 x i16> %y, %mul = mul <2 x i16> %x, %sub ret <2 x i16> %mul @@ -2298,6 +2946,18 @@ define <2 x i16> @v_mul_sub_1_v2i16_commute(<2 x i16> %x, <2 x i16> %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_mul_lo_u16 v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_v2i16_commute: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_pk_sub_i16 v1, v1, 1 op_sel_hi:[1,0] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_pk_mul_lo_u16 v0, v1, v0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub <2 x i16> %y, %mul = mul <2 x i16> %sub, %x ret <2 x i16> %mul @@ -2352,6 +3012,18 
@@ define <2 x i16> @v_mul_sub_x_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_x_v2i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_pk_mul_lo_u16 v1, v0, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul <2 x i16> %x, %y %sub = sub <2 x i16> %mul, %x ret <2 x i16> %sub @@ -2408,6 +3080,18 @@ define <2 x i16> @v_mul_add_2_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_2_v2i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_pk_add_u16 v1, v1, 2 op_sel_hi:[1,0] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i16> %y, %mul = mul <2 x i16> %x, %add ret <2 x i16> %mul @@ -2464,6 +3148,18 @@ define <2 x i16> @v_mul_sub_2_v2i16(<2 x i16> %x, <2 x i16> %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_mul_lo_u16 v0, v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_2_v2i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_pk_sub_i16 v1, v1, 2 op_sel_hi:[1,0] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] 
%sub = sub <2 x i16> %y, %mul = mul <2 x i16> %x, %sub ret <2 x i16> %mul @@ -2520,6 +3216,19 @@ define <2 x i32> @v_mul_add_1_v2i32(<2 x i32> %x, <2 x i32> %y) { ; GFX1250-NEXT: v_mad_u32 v0, v0, v2, v0 ; GFX1250-NEXT: v_mad_u32 v1, v1, v3, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_v2i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[4:5], null, v0, v2, v[0:1] +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v3, v[1:2] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_mov_b32_e32 v0, v4 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i32> %y, %mul = mul <2 x i32> %x, %add ret <2 x i32> %mul @@ -2576,6 +3285,19 @@ define <2 x i32> @v_mul_add_1_v2i32_commute(<2 x i32> %x, <2 x i32> %y) { ; GFX1250-NEXT: v_mad_u32 v0, v0, v2, v0 ; GFX1250-NEXT: v_mad_u32 v1, v1, v3, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_v2i32_commute: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[4:5], null, v0, v2, v[0:1] +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v3, v[1:2] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_mov_b32_e32 v0, v4 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i32> %y, %mul = mul <2 x i32> %add, %x ret <2 x i32> %mul @@ -2632,6 +3354,19 @@ define <2 x i32> @v_mul_add_x_v2i32(<2 x i32> %x, <2 x i32> %y) { ; GFX1250-NEXT: v_mad_u32 v0, v0, v2, v0 ; GFX1250-NEXT: v_mad_u32 v1, v1, v3, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_x_v2i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: 
s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[4:5], null, v0, v2, v[0:1] +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v3, v[1:2] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_mov_b32_e32 v0, v4 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul <2 x i32> %x, %y %add = add <2 x i32> %x, %mul ret <2 x i32> %add @@ -2683,6 +3418,19 @@ define <2 x i32> @v_mul_sub_1_v2i32(<2 x i32> %x, <2 x i32> %y) { ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_v2i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_add_nc_u32 v2, -1, v2 :: v_dual_add_nc_u32 v3, -1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX13-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub <2 x i32> %y, %mul = mul <2 x i32> %x, %sub ret <2 x i32> %mul @@ -2734,6 +3482,19 @@ define <2 x i32> @v_mul_sub_1_v2i32_commute(<2 x i32> %x, <2 x i32> %y) { ; GFX1250-NEXT: v_mul_lo_u32 v0, v2, v0 ; GFX1250-NEXT: v_mul_lo_u32 v1, v3, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_v2i32_commute: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_add_nc_u32 v2, -1, v2 :: v_dual_add_nc_u32 v3, -1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mul_lo_u32 v0, v2, v0 +; GFX13-NEXT: v_mul_lo_u32 v1, v3, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub <2 x i32> %y, %mul = mul <2 x i32> %sub, %x ret <2 x i32> %mul @@ -2794,6 +3555,19 @@ define <2 x 
i32> @v_mul_sub_x_v2i32(<2 x i32> %x, <2 x i32> %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_dual_sub_nc_u32 v0, v2, v0 :: v_dual_sub_nc_u32 v1, v3, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_x_v2i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mul_lo_u32 v2, v0, v2 +; GFX13-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_dual_sub_nc_u32 v0, v2, v0 :: v_dual_sub_nc_u32 v1, v3, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul <2 x i32> %x, %y %sub = sub <2 x i32> %mul, %x ret <2 x i32> %sub @@ -2845,6 +3619,19 @@ define <2 x i32> @v_mul_add_2_v2i32(<2 x i32> %x, <2 x i32> %y) { ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_2_v2i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_add_nc_u32 v2, 2, v2 :: v_dual_add_nc_u32 v3, 2, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX13-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i32> %y, %mul = mul <2 x i32> %x, %add ret <2 x i32> %mul @@ -2896,6 +3683,19 @@ define <2 x i32> @v_mul_sub_2_v2i32(<2 x i32> %x, <2 x i32> %y) { ; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v2 ; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v3 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_2_v2i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; 
GFX13-NEXT: v_dual_add_nc_u32 v2, -2, v2 :: v_dual_add_nc_u32 v3, -2, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mul_lo_u32 v0, v0, v2 +; GFX13-NEXT: v_mul_lo_u32 v1, v1, v3 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub <2 x i32> %y, %mul = mul <2 x i32> %x, %sub ret <2 x i32> %mul @@ -2947,6 +3747,19 @@ define <2 x i24> @v_mul_add_1_v2i24(<2 x i24> %x, <2 x i24> %y) { ; GFX1250-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX1250-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_v2i24: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_add_nc_u32 v2, 1, v2 :: v_dual_add_nc_u32 v3, 1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX13-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i24> %y, %mul = mul <2 x i24> %x, %add ret <2 x i24> %mul @@ -2998,6 +3811,19 @@ define <2 x i24> @v_mul_add_1_v2i24_commute(<2 x i24> %x, <2 x i24> %y) { ; GFX1250-NEXT: v_mul_u32_u24_e32 v0, v2, v0 ; GFX1250-NEXT: v_mul_u32_u24_e32 v1, v3, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_1_v2i24_commute: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_add_nc_u32 v2, 1, v2 :: v_dual_add_nc_u32 v3, 1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX13-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i24> %y, %mul = mul <2 x i24> %add, %x ret <2 x i24> %mul @@ -3039,6 +3865,17 
@@ define <2 x i24> @v_mul_add_x_v2i24(<2 x i24> %x, <2 x i24> %y) { ; GFX1250-NEXT: v_mad_u32_u24 v0, v0, v2, v0 ; GFX1250-NEXT: v_mad_u32_u24 v1, v1, v3, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_x_v2i24: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_u32_u24 v0, v0, v2, v0 +; GFX13-NEXT: v_mad_u32_u24 v1, v1, v3, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul <2 x i24> %x, %y %add = add <2 x i24> %x, %mul ret <2 x i24> %add @@ -3090,6 +3927,19 @@ define <2 x i24> @v_mul_sub_1_v2i24(<2 x i24> %x, <2 x i24> %y) { ; GFX1250-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX1250-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_v2i24: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_add_nc_u32 v2, -1, v2 :: v_dual_add_nc_u32 v3, -1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX13-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub <2 x i24> %y, %mul = mul <2 x i24> %x, %sub ret <2 x i24> %mul @@ -3141,6 +3991,19 @@ define <2 x i24> @v_mul_sub_1_v2i24_commute(<2 x i24> %x, <2 x i24> %y) { ; GFX1250-NEXT: v_mul_u32_u24_e32 v0, v2, v0 ; GFX1250-NEXT: v_mul_u32_u24_e32 v1, v3, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_1_v2i24_commute: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_add_nc_u32 v2, -1, v2 :: v_dual_add_nc_u32 v3, -1, v3 +; GFX13-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v2, v0 +; GFX13-NEXT: v_mul_u32_u24_e32 v1, v3, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub <2 x i24> %y, %mul = mul <2 x i24> %sub, %x ret <2 x i24> %mul @@ -3192,6 +4055,19 @@ define <2 x i24> @v_mul_sub_x_v2i24(<2 x i24> %x, <2 x i24> %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_dual_sub_nc_u32 v0, v2, v0 :: v_dual_sub_nc_u32 v1, v3, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_x_v2i24: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mul_u32_u24_e32 v2, v0, v2 +; GFX13-NEXT: v_mul_u32_u24_e32 v3, v1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_dual_sub_nc_u32 v0, v2, v0 :: v_dual_sub_nc_u32 v1, v3, v1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul <2 x i24> %x, %y %sub = sub <2 x i24> %mul, %x ret <2 x i24> %sub @@ -3243,6 +4119,19 @@ define <2 x i24> @v_mul_add_2_v2i24(<2 x i24> %x, <2 x i24> %y) { ; GFX1250-NEXT: v_mul_u32_u24_e32 v0, v0, v2 ; GFX1250-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_add_2_v2i24: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_add_nc_u32 v2, 2, v2 :: v_dual_add_nc_u32 v3, 2, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX13-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i24> %y, %mul = mul <2 x i24> %x, %add ret <2 x i24> %mul @@ -3294,6 +4183,19 @@ define <2 x i24> @v_mul_sub_2_v2i24(<2 x i24> %x, <2 x i24> %y) { ; GFX1250-NEXT: 
v_mul_u32_u24_e32 v0, v0, v2 ; GFX1250-NEXT: v_mul_u32_u24_e32 v1, v1, v3 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_sub_2_v2i24: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_add_nc_u32 v2, -2, v2 :: v_dual_add_nc_u32 v3, -2, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v0, v2 +; GFX13-NEXT: v_mul_u32_u24_e32 v1, v1, v3 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %sub = sub <2 x i24> %y, %mul = mul <2 x i24> %x, %sub ret <2 x i24> %mul @@ -3332,6 +4234,16 @@ define i32 @v_mul_9_add_52_i32(i32 %arg) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mad_u32 v0, v0, 9, 52 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_9_add_52_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, 9, 52 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i32 %arg, 9 %add = add i32 %mul, 52 ret i32 %add @@ -3376,6 +4288,26 @@ define i16 @v_mul_9_add_52_i16(i16 %arg) { ; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, 9, 52 ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_9_add_52_i16: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, 9, 52 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_9_add_52_i16: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 
+; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, 9, 52 +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %mul = mul i16 %arg, 9 %add = add i16 %mul, 52 ret i16 %add @@ -3423,6 +4355,16 @@ define <2 x i16> @v_mul_9_add_52_v2i16(<2 x i16> %arg) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_mad_u16 v0, v0, 9, 52 op_sel_hi:[1,0,0] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_9_add_52_v2i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_pk_mad_u16 v0, v0, 9, 52 op_sel_hi:[1,0,0] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul <2 x i16> %arg, %add = add <2 x i16> %mul, ret <2 x i16> %add @@ -3491,6 +4433,19 @@ define i64 @v_mul_9_add_52_i64(i64 %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mad_u32 v1, v2, 9, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_9_add_52_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mov_b32_e32 v2, v1 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, 9, 52 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, 9, v[1:2] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i64 %arg, 9 %add = add i64 %mul, 52 ret i64 %add @@ -3529,6 +4484,16 @@ define i32 @v_mul_5_add_1_i32(i32 %arg) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mad_u32 v0, v0, 5, 1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_5_add_1_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; 
GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, 5, 1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i32 %arg, 5 %add = add i32 %mul, 1 ret i32 %add @@ -3574,6 +4539,18 @@ define i32 @v_mul_284_add_82_i32(i32 %arg) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mad_u32 v0, v0, s0, 0x52 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_284_add_82_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x11c +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, s0, 0x52 +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i32 %arg, 284 %add = add i32 %mul, 82 ret i32 %add @@ -3618,6 +4595,26 @@ define i16 @v_mul_5_add_1_i16(i16 %arg) { ; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, 5, 1 ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_5_add_1_i16: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, 5, 1 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_5_add_1_i16: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, 5, 1 +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %mul = mul i16 %arg, 5 %add = add i16 %mul, 1 ret i16 %add @@ -3673,6 +4670,30 @@ define i16 @v_mul_284_add_82_i16(i16 %arg) { ; GFX1250-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 0x52 ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_284_add_82_i16: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: s_movk_i32 s0, 0x11c +; GFX13-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, s0, 0x52 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_284_add_82_i16: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mov_b16_e32 v1.l, 0x11c +; GFX13-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, 0x52 +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %mul = mul i16 %arg, 284 %add = add i16 %mul, 82 ret i16 %add @@ -3720,6 +4741,16 @@ define <2 x i16> @v_mul_5_add_1_v2i16(<2 x i16> %arg) { ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_mad_u16 v0, v0, 5, 1 op_sel_hi:[1,0,0] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_5_add_1_v2i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_pk_mad_u16 v0, v0, 5, 1 op_sel_hi:[1,0,0] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul <2 x i16> %arg, %add = add <2 x i16> %mul, ret <2 x i16> %add @@ -3776,6 +4807,18 @@ define <2 x i16> @v_mul_284_add_82_v2i16(<2 x i16> %arg) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_pk_mad_u16 v0, v0, s0, 0x52 op_sel_hi:[1,0,0] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: 
v_mul_284_add_82_v2i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x11c +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_pk_mad_u16 v0, v0, s0, 0x52 op_sel_hi:[1,0,0] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul <2 x i16> %arg, %add = add <2 x i16> %mul, ret <2 x i16> %add @@ -3844,6 +4887,19 @@ define i64 @v_mul_5_add_1_i64(i64 %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mad_u32 v1, v2, 5, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_5_add_1_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mov_b32_e32 v2, v1 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, 5, 1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, 5, v[1:2] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i64 %arg, 5 %add = add i64 %mul, 1 ret i64 %add @@ -3927,6 +4983,20 @@ define i64 @v_mul_284_add_82_i64(i64 %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mad_u32 v1, 0x11c, v2, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_284_add_82_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x11c +; GFX13-NEXT: v_mov_b32_e32 v2, v1 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, s0, 0x52 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, 0x11c, v2, v[1:2] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i64 %arg, 284 %add = add i64 %mul, 82 ret i64 %add @@ -4010,6 +5080,20 @@ define i64 
@v_mul_934584645_add_8234599_i64(i64 %arg) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_mad_u32 v1, 0x37b4a145, v2, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_mul_934584645_add_8234599_i64: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_mov_b32 s0, 0x37b4a145 +; GFX13-NEXT: v_mov_b32_e32 v2, v1 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, s0, 0x7da667 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_mad_co_u64_u32 v[1:2], null, 0x37b4a145, v2, v[1:2] +; GFX13-NEXT: s_set_pc_i64 s[30:31] %mul = mul i64 %arg, 934584645 %add = add i64 %mul, 8234599 ret i64 %add @@ -4248,6 +5332,55 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) % ; GFX1250-NEXT: v_mad_u32 v2, v3, v2, v3 ; GFX1250-NEXT: global_store_b32 v[0:1], v2, off ; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: compute_mad: +; GFX13: ; %bb.0: ; %bb +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x10 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: s_load_b128 s[4:7], s[4:5], 0x0 nv +; GFX13-NEXT: s_bfe_u32 s8, ttmp6, 0x4000c +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_add_co_i32 s8, s8, 1 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_add_co_i32 s2, s2, 1 +; GFX13-NEXT: s_load_b32 s6, s[6:7], 0x4 nv +; GFX13-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX13-NEXT: s_and_b32 s7, ttmp6, 15 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_dual_add_nc_u32 v2, s2, v1 :: v_dual_add_nc_u32 v1, 1, v1 +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv +; GFX13-NEXT: s_mul_i32 s4, ttmp9, s8 +; GFX13-NEXT: s_getreg_b32 s5, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX13-NEXT: v_mul_lo_u32 v2, v2, v0 +; GFX13-NEXT: s_add_co_i32 s7, s7, s4 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_and_b32 s4, s6, 0xffff +; 
GFX13-NEXT: s_cmp_eq_u32 s5, 0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v3, v2, v1 +; GFX13-NEXT: s_cselect_b32 s5, ttmp9, s7 +; GFX13-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX13-NEXT: v_add_nc_u32_e32 v1, 1, v3 +; GFX13-NEXT: v_mul_lo_u32 v4, v2, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_add_nc_u32_e32 v3, v4, v1 +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, s5, s4, v[0:1] +; GFX13-NEXT: v_mul_lo_u32 v1, v3, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_co_u32 v2, s2, s2, v0 +; GFX13-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mad_co_u64_u32 v[4:5], null, v1, v4, v[1:2] +; GFX13-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v1, v[4:5] +; GFX13-NEXT: v_add_co_u32 v1, vcc_lo, s0, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_add_co_ci_u32_e64 v2, null, s1, v3, vcc_lo +; GFX13-NEXT: global_store_b32 v[1:2], v0, off +; GFX13-NEXT: s_endpgm bb: %i = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 %i2 = add i32 %arg1, 1 @@ -4312,6 +5445,13 @@ define amdgpu_ps i32 @s_mul_add_1_i32(i32 inreg %x, i32 inreg %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_i32 s0, s0, s1 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_add_1_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_add_co_i32 s1, s1, 1 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_mul_i32 s0, s0, s1 +; GFX13-NEXT: ; return to shader part epilog %add = add i32 %y, 1 %mul 
= mul i32 %x, %add ret i32 %mul @@ -4349,6 +5489,13 @@ define amdgpu_ps i32 @s_mul_add_1_i32_commute(i32 inreg %x, i32 inreg %y) { ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_i32 s0, s1, s0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: s_mul_add_1_i32_commute: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_add_co_i32 s1, s1, 1 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: s_mul_i32 s0, s1, s0 +; GFX13-NEXT: ; return to shader part epilog %add = add i32 %y, 1 %mul = mul i32 %add, %x ret i32 %mul @@ -4395,6 +5542,26 @@ define i8 @v_mul_add_1_i8(i8 %x, i8 %y) { ; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_add_1_i8: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_add_1_i8: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i8 %y, 1 %mul = mul i8 %x, %add ret i8 %mul @@ -4441,6 +5608,26 @@ define i8 @v_mul_add_1_i8_commute(i8 %x, i8 %y) { ; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_add_1_i8_commute: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; 
GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_add_1_i8_commute: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i8 %y, 1 %mul = mul i8 %add, %x ret i8 %mul @@ -4486,6 +5673,26 @@ define i8 @v_mul_add_1_i8_zext(i8 zeroext %x, i8 zeroext %y) { ; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_add_1_i8_zext: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_add_1_i8_zext: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i8 %y, 1 %mul = mul i8 %x, %add ret i8 %mul @@ -4531,6 +5738,26 @@ define i8 @v_mul_add_1_i8_zext_commute(i8 zeroext %x, i8 zeroext %y) { ; GFX1250-REAL16-NEXT: s_wait_kmcnt 0x0 ; GFX1250-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_add_1_i8_zext_commute: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, v1, v0 +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_add_1_i8_zext_commute: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v0.l +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add i8 %y, 1 %mul = mul i8 %add, %x ret i8 %mul @@ -4606,6 +5833,36 @@ define <2 x i8> @v_mul_add_1_v2i8(<2 x i8> %x, <2 x i8> %y) { ; GFX1250-REAL16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 0xff bitop3:0xec ; GFX1250-REAL16-NEXT: v_and_b16 v1.l, 0xff, v0.h ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_add_1_v2i8: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v1, v1, v3, v1 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, v2, v0 +; GFX13-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v1 +; GFX13-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX13-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, 0xff bitop3:0xec +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_add_1_v2i8: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.h, v1.l, v3.l, v1.l +; GFX13-REAL16-NEXT: 
v_mad_u16 v0.l, v0.l, v2.l, v0.l +; GFX13-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-REAL16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h +; GFX13-REAL16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 0xff bitop3:0xec +; GFX13-REAL16-NEXT: v_and_b16 v1.l, 0xff, v0.h +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i8> %y, %mul = mul <2 x i8> %x, %add ret <2 x i8> %mul @@ -4681,6 +5938,36 @@ define <2 x i8> @v_mul_add_1_v2i8_commute(<2 x i8> %x, <2 x i8> %y) { ; GFX1250-REAL16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 0xff bitop3:0xec ; GFX1250-REAL16-NEXT: v_and_b16 v1.l, 0xff, v0.h ; GFX1250-REAL16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-FAKE16-LABEL: v_mul_add_1_v2i8_commute: +; GFX13-FAKE16: ; %bb.0: +; GFX13-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX13-FAKE16-NEXT: v_mad_u16 v1, v1, v3, v1 +; GFX13-FAKE16-NEXT: v_mad_u16 v0, v0, v2, v0 +; GFX13-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-FAKE16-NEXT: v_lshlrev_b16 v2, 8, v1 +; GFX13-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX13-FAKE16-NEXT: v_bitop3_b16 v0, v0, v2, 0xff bitop3:0xec +; GFX13-FAKE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-REAL16-LABEL: v_mul_add_1_v2i8_commute: +; GFX13-REAL16: ; %bb.0: +; GFX13-REAL16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-REAL16-NEXT: s_wait_expcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_samplecnt 0x0 +; GFX13-REAL16-NEXT: s_wait_bvhcnt 0x0 +; GFX13-REAL16-NEXT: s_wait_kmcnt 0x0 +; GFX13-REAL16-NEXT: v_mad_u16 v0.h, v1.l, v3.l, v1.l +; GFX13-REAL16-NEXT: v_mad_u16 v0.l, v0.l, v2.l, v0.l +; GFX13-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-REAL16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h +; GFX13-REAL16-NEXT: v_bitop3_b16 v0.l, v0.l, v1.l, 0xff bitop3:0xec +; GFX13-REAL16-NEXT: v_and_b16 v1.l, 
0xff, v0.h +; GFX13-REAL16-NEXT: s_set_pc_i64 s[30:31] %add = add <2 x i8> %y, %mul = mul <2 x i8> %add, %x ret <2 x i8> %mul @@ -4734,6 +6021,20 @@ define i64 @mul_u24_with_uneven_operands(i32 %z) { ; GFX1250-NEXT: v_mul_u32_u24_e32 v0, v1, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: mul_u24_with_uneven_operands: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_nc_u32_e32 v1, 1, v0 +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v1, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] entry: %c = and i32 %z, 1 %d = add nuw nsw i32 %c, 1 @@ -4788,6 +6089,20 @@ define i64 @mul_u24_with_uneven_operands_swapped(i32 %z) { ; GFX1250-NEXT: v_mul_u32_u24_e32 v0, v0, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: mul_u24_with_uneven_operands_swapped: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_nc_u32_e32 v1, 1, v0 +; GFX13-NEXT: v_mul_u32_u24_e32 v0, v0, v1 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] entry: %c = and i32 %z, 1 %d = add nuw nsw i32 %c, 1 @@ -4843,6 +6158,20 @@ define i64 @mul_i24_with_uneven_operands(i32 %z) { ; GFX1250-NEXT: v_mul_i32_i24_e32 v0, v1, v0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: mul_i24_with_uneven_operands: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_nc_u32_e32 v1, 1, v0 +; GFX13-NEXT: v_mul_i32_i24_e32 v0, v1, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] entry: %c = and i32 %z, 1 %d = add nuw nsw i32 %c, 1 @@ -4897,6 +6226,20 @@ define i64 @mul_i24_with_uneven_operands_swapped(i32 %z) { ; GFX1250-NEXT: v_mul_i32_i24_e32 v0, v0, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: mul_i24_with_uneven_operands_swapped: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_add_nc_u32_e32 v1, 1, v0 +; GFX13-NEXT: v_mul_i32_i24_e32 v0, v0, v1 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] entry: %c = and i32 %z, 1 %d = add nuw nsw i32 %c, 1 diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll index 57c4b4883f391..85a11c25fa5b4 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll @@ -1,17 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,SDAG,GCN-FAKE16,SDAG-FAKE16 %s -; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,GISEL,GCN-FAKE16,GISEL-FAKE16 %s -; RUN: llc 
-global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,SDAG,GCN-REAL16,SDAG-REAL16 %s -; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,GISEL,GCN-REAL16,GISEL-REAL16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX13,GFX13-SDAG,GFX13-FAKE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX13,GFX13-GISEL,GFX13-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1310 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX13,GFX13-SDAG,GFX13-TRUE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1310 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX13,GFX13-GISEL,GFX13-TRUE16 %s define amdgpu_ps float @global_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: global_load_b32_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b32_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b32_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom @@ -20,12 +30,18 @@ entry: } define amdgpu_ps float @global_load_b32_idx32(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: global_load_b32_idx32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b32_idx32: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b32_idx32: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i32 %idx %ret = load float, ptr addrspace(1) %arrayidx, align 4 @@ -33,15 +49,40 @@ entry: } define amdgpu_ps float @global_load_b32_idxprom_wrong_stride(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: global_load_b32_idxprom_wrong_stride: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b32_idxprom_wrong_stride: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-SDAG-LABEL: global_load_b32_idxprom_wrong_stride: +; GFX13-SDAG: ; %bb.0: ; %entry +; GFX13-SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1] +; GFX13-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v1, vcc_lo +; GFX13-SDAG-NEXT: global_load_b32 v0, v[0:1], off +; GFX13-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX13-SDAG-NEXT: ; return to shader part epilog +; +; GFX13-GISEL-LABEL: global_load_b32_idxprom_wrong_stride: +; GFX13-GISEL: ; %bb.0: ; %entry +; GFX13-GISEL-NEXT: v_dual_ashrrev_i32 v1, 31, v0 :: v_dual_mov_b32 v3, s1 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1] +; GFX13-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo +; GFX13-GISEL-NEXT: global_load_b32 v0, v[0:1], off +; GFX13-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX13-GISEL-NEXT: ; return to shader part epilog entry: %idxprom = 
sext i32 %idx to i64 %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom @@ -50,12 +91,18 @@ entry: } define amdgpu_ps float @global_load_b16_idxprom_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: global_load_b16_idxprom_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b16_idxprom_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b16_idxprom_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = sext i32 %idx to i64 %idxadd = add i64 %idxprom, 16 @@ -67,12 +114,18 @@ entry: } define amdgpu_ps <2 x float> @global_load_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: global_load_b64_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b64_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b64_idxprom: +; 
GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom @@ -81,12 +134,18 @@ entry: } define amdgpu_ps <3 x float> @global_load_b96_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: global_load_b96_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b96_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b96_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom @@ -95,12 +154,18 @@ entry: } define amdgpu_ps <3 x float> @global_load_b96_idxpromi_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: global_load_b96_idxpromi_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b96_idxpromi_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 
1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b96_idxpromi_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = sext i32 %idx to i64 %idxadd = add i64 %idxprom, 16 @@ -110,12 +175,18 @@ entry: } define amdgpu_ps <4 x float> @global_load_b128_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: global_load_b128_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b128_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b128_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom @@ -124,14 +195,22 @@ entry: } define amdgpu_ps float @global_load_b32_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { -; GCN-LABEL: global_load_b32_idxprom_range: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: 
s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b32_idxprom_range: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b32_idxprom_range: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b32 v0, v[0:1], off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 %idxprom = sext i32 %idx to i64 @@ -141,14 +220,22 @@ entry: } define amdgpu_ps float @global_load_b32_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { -; GCN-LABEL: global_load_b32_idxprom_range_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_b32 v0, v0, s[0:1] offset:64 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b32_idxprom_range_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] offset:64 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: 
global_load_b32_idxprom_range_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b32 v0, v[0:1], off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: global_load_b32 v0, v0, s[0:1] offset:64 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 %idxprom = sext i32 %idx to i64 @@ -161,14 +248,22 @@ entry: ; Note: this is a byte load, there is nothing to scale define amdgpu_ps float @global_load_b8_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { -; GCN-LABEL: global_load_b8_idxprom_range_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_u8 v0, v0, s[0:1] offset:16 -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b8_idxprom_range_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_load_u8 v0, v0, s[0:1] offset:16 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b8_idxprom_range_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b32 v0, v[0:1], off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: global_load_u8 v0, v0, s[0:1] offset:16 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 %idxprom = sext i32 %idx to i64 @@ -181,14 +276,22 @@ entry: } define amdgpu_ps float @global_load_b16_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { -; GCN-LABEL: 
global_load_b16_idxprom_range: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_u16 v0, v0, s[0:1] scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b16_idxprom_range: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b16_idxprom_range: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b32 v0, v[0:1], off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: global_load_u16 v0, v0, s[0:1] scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 %idxprom = sext i32 %idx to i64 @@ -200,14 +303,22 @@ entry: } define amdgpu_ps float @global_load_b16_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { -; GCN-LABEL: global_load_b16_idxprom_range_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b16_idxprom_range_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b16_idxprom_range_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b32 v0, v[0:1], off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 %idxprom = sext i32 %idx to i64 @@ -220,14 +331,22 @@ entry: } define amdgpu_ps <2 x float> @global_load_b64_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { -; GCN-LABEL: global_load_b64_idxprom_range: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b64_idxprom_range: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b64_idxprom_range: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b32 v0, v[0:1], off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 %idxprom = sext i32 %idx to i64 @@ -237,14 +356,22 @@ entry: } define amdgpu_ps <3 
x float> @global_load_b96_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { -; GCN-LABEL: global_load_b96_idxprom_range: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b96_idxprom_range: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b96_idxprom_range: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b32 v0, v[0:1], off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 %idxprom = sext i32 %idx to i64 @@ -254,14 +381,22 @@ entry: } define amdgpu_ps <3 x float> @global_load_b96_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { -; GCN-LABEL: global_load_b96_idxprom_range_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b96_idxprom_range_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b96_idxprom_range_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b32 v0, v[0:1], off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 %idxprom = sext i32 %idx to i64 @@ -272,14 +407,22 @@ entry: } define amdgpu_ps <4 x float> @global_load_b128_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { -; GCN-LABEL: global_load_b128_idxprom_range: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_load_b128_idxprom_range: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_load_b128_idxprom_range: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: global_load_b32 v0, v[0:1], off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return 
to shader part epilog entry: %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 %idxprom = sext i32 %idx to i64 @@ -289,12 +432,18 @@ entry: } define amdgpu_ps void @global_store_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: global_store_b32_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: global_store_b32_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: global_store_b32_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX13-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GFX13-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom @@ -303,19 +452,31 @@ entry: } define amdgpu_ps void @global_store_b16_idxprom(ptr addrspace(1) align 2 inreg %p, i32 %idx) { -; GCN-FAKE16-LABEL: global_store_b16_idxprom: -; GCN-FAKE16: ; %bb.0: ; %entry -; GCN-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-FAKE16-NEXT: v_mov_b32_e32 v1, 1 -; GCN-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset -; GCN-FAKE16-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: global_store_b16_idxprom: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX1250-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-TRUE16-LABEL: global_store_b16_idxprom: +; 
GFX1250-TRUE16: ; %bb.0: ; %entry +; GFX1250-TRUE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX1250-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset +; GFX1250-TRUE16-NEXT: s_endpgm +; +; GFX13-FAKE16-LABEL: global_store_b16_idxprom: +; GFX13-FAKE16: ; %bb.0: ; %entry +; GFX13-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX13-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset +; GFX13-FAKE16-NEXT: s_endpgm ; -; GCN-REAL16-LABEL: global_store_b16_idxprom: -; GCN-REAL16: ; %bb.0: ; %entry -; GCN-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-REAL16-NEXT: v_mov_b16_e32 v1.l, 1 -; GCN-REAL16-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset -; GCN-REAL16-NEXT: s_endpgm +; GFX13-TRUE16-LABEL: global_store_b16_idxprom: +; GFX13-TRUE16: ; %bb.0: ; %entry +; GFX13-TRUE16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX13-TRUE16-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset +; GFX13-TRUE16-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom @@ -324,12 +485,18 @@ entry: } define amdgpu_ps void @global_store_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: global_store_b64_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 -; GCN-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: global_store_b64_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 1.0 +; GFX1250-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset +; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: global_store_b64_idxprom: +; GFX13: ; %bb.0: ; %entry +; 
GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0x3ff00000 +; GFX13-NEXT: global_store_b64 v0, v[1:2], s[0:1] scale_offset +; GFX13-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds double, ptr addrspace(1) %p, i64 %idxprom @@ -338,12 +505,18 @@ entry: } define amdgpu_ps void @global_atomicrmw_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: global_atomicrmw_b32_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: global_atomicrmw_b32_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 1 +; GFX1250-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: global_atomicrmw_b32_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: v_mov_b32_e32 v1, 1 +; GFX13-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GFX13-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %idxprom @@ -352,13 +525,20 @@ entry: } define amdgpu_ps <2 x float> @global_atomicrmw_b64_rtn_idxprom(ptr addrspace(1) align 8 inreg %p, i32 %idx) { -; GCN-LABEL: global_atomicrmw_b64_rtn_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_mov_b64_e32 v[2:3], 1 -; GCN-NEXT: global_atomic_add_u64 v[0:1], v0, v[2:3], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: global_atomicrmw_b64_rtn_idxprom: +; GFX1250: ; %bb.0: ; %entry +; 
GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 1 +; GFX1250-NEXT: global_atomic_add_u64 v[0:1], v0, v[2:3], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: global_atomicrmw_b64_rtn_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0 +; GFX13-NEXT: global_atomic_add_u64 v[0:1], v0, v[1:2], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %p, i64 %idxprom @@ -368,10 +548,3 @@ entry: } !0 = !{i32 0, i32 1024} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GISEL: {{.*}} -; GISEL-FAKE16: {{.*}} -; GISEL-REAL16: {{.*}} -; SDAG: {{.*}} -; SDAG-FAKE16: {{.*}} -; SDAG-REAL16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll index df24405b54dbf..8dc1757229781 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll @@ -1,16 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,SDAG,GCN-FAKE16,SDAG-FAKE16 %s -; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GCN,GISEL,GCN-FAKE16,GISEL-FAKE16 %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,SDAG,GCN-REAL16,SDAG-REAL16 %s -; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 
-mattr=+real-true16 < %s | FileCheck --check-prefixes=GCN,GISEL,GCN-REAL16,GISEL-REAL16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-REAL16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX1250,GFX1250-REAL16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX13,GFX13-FAKE16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1310 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX13,GFX13-FAKE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1310 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX13,GFX13-REAL16 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1310 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX13,GFX13-REAL16 %s define amdgpu_ps float @scratch_load_b32_alloca_idxprom(i32 %idx) { -; GCN-LABEL: scratch_load_b32_alloca_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, off scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b32_alloca_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, off scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: 
scratch_load_b32_alloca_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, off scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %p = alloca [64 x i32], align 4, addrspace(5) %idxprom = zext i32 %idx to i64 @@ -20,12 +30,18 @@ entry: } define amdgpu_ps float @scratch_load_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: scratch_load_b32_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b32_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, s0 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b32_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, s0 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = sext i32 %idx to i64 %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom @@ -34,12 +50,18 @@ entry: } define amdgpu_ps float @scratch_load_b32_idx32(ptr addrspace(5) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: scratch_load_b32_idx32: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b32_idx32: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, s0 
scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b32_idx32: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, s0 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i32 %idx %ret = load float, ptr addrspace(5) %arrayidx, align 4 @@ -47,13 +69,20 @@ entry: } define amdgpu_ps float @scratch_load_b32_idxprom_wrong_stride(ptr addrspace(5) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: scratch_load_b32_idxprom_wrong_stride: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GCN-NEXT: scratch_load_b32 v0, v0, s0 -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b32_idxprom_wrong_stride: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b32_idxprom_wrong_stride: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX13-NEXT: scratch_load_b32 v0, v0, s0 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom @@ -62,12 +91,18 @@ entry: } define amdgpu_ps float @scratch_load_b16_idxprom_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: scratch_load_b16_idxprom_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_u16 v0, v0, 
s0 offset:32 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b16_idxprom_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b16_idxprom_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = sext i32 %idx to i64 %idxadd = add i64 %idxprom, 16 @@ -79,12 +114,18 @@ entry: } define amdgpu_ps <2 x float> @scratch_load_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: scratch_load_b64_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b64_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b64_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom @@ -93,12 +134,18 @@ entry: } define amdgpu_ps <3 x float> @scratch_load_b96_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: 
scratch_load_b96_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b96_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b96_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxprom @@ -107,12 +154,18 @@ entry: } define amdgpu_ps <3 x float> @scratch_load_b96_idxpromi_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: scratch_load_b96_idxpromi_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b96_idxpromi_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b96_idxpromi_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: 
%idxprom = zext i32 %idx to i64 %idxadd = add i64 %idxprom, 16 @@ -122,12 +175,18 @@ entry: } define amdgpu_ps <4 x float> @scratch_load_b128_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: scratch_load_b128_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b128_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b128_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(5) %p, i64 %idxprom @@ -136,14 +195,22 @@ entry: } define amdgpu_ps float @scratch_load_b32_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { -; GCN-LABEL: scratch_load_b32_idxprom_range: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b32_idxprom_range: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v0, 
v0, s0 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b32_idxprom_range: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: scratch_load_b32 v0, v0, s0 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 %idxprom = zext i32 %idx to i64 @@ -153,14 +220,22 @@ entry: } define amdgpu_ps float @scratch_load_b32_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { -; GCN-LABEL: scratch_load_b32_idxprom_range_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: scratch_load_b32 v0, v0, s0 offset:64 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b32_idxprom_range_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, s0 offset:64 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b32_idxprom_range_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: scratch_load_b32 v0, v0, s0 offset:64 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 %idxprom = zext i32 %idx to i64 @@ -171,14 +246,22 @@ entry: } define amdgpu_ps float @scratch_load_b8_idxprom_range_ioffset(ptr addrspace(5) 
align 4 inreg %p, ptr addrspace(5) align 4 %pp) { -; GCN-LABEL: scratch_load_b8_idxprom_range_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: scratch_load_u8 v0, v0, s0 offset:16 -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b8_idxprom_range_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_load_u8 v0, v0, s0 offset:16 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b8_idxprom_range_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: scratch_load_u8 v0, v0, s0 offset:16 +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 %idxprom = zext i32 %idx to i64 @@ -191,14 +274,22 @@ entry: } define amdgpu_ps float @scratch_load_b16_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { -; GCN-LABEL: scratch_load_b16_idxprom_range: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: scratch_load_u16 v0, v0, s0 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b16_idxprom_range: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, off +; 
GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_load_u16 v0, v0, s0 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b16_idxprom_range: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: scratch_load_u16 v0, v0, s0 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 %idxprom = zext i32 %idx to i64 @@ -210,14 +301,22 @@ entry: } define amdgpu_ps float @scratch_load_b16_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { -; GCN-LABEL: scratch_load_b16_idxprom_range_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b16_idxprom_range_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b16_idxprom_range_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 %idxprom = zext i32 %idx to i64 @@ -230,14 +329,22 @@ entry: } define 
amdgpu_ps <2 x float> @scratch_load_b64_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { -; GCN-LABEL: scratch_load_b64_idxprom_range: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b64_idxprom_range: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b64_idxprom_range: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 %idxprom = zext i32 %idx to i64 @@ -249,14 +356,22 @@ entry: ; Multiplication is unsigned here, so we cannot match it. 
define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { -; GCN-LABEL: scratch_load_b96_idxprom_range: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b96_idxprom_range: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b96_idxprom_range: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 %idxprom = sext i32 %idx to i64 @@ -266,14 +381,22 @@ entry: } define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { -; GCN-LABEL: scratch_load_b96_idxprom_range_ioffset: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b96_idxprom_range_ioffset: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b96_idxprom_range_ioffset: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 %idxprom = sext i32 %idx to i64 @@ -284,14 +407,22 @@ entry: } define amdgpu_ps <4 x float> @scratch_load_b128_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { -; GCN-LABEL: scratch_load_b128_idxprom_range: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: scratch_load_b32 v0, v0, off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: ; return to shader part epilog +; GFX1250-LABEL: scratch_load_b128_idxprom_range: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: scratch_load_b32 v0, v0, off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog +; +; GFX13-LABEL: scratch_load_b128_idxprom_range: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: scratch_load_b32 v0, v0, off +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset +; GFX13-NEXT: s_wait_loadcnt 0x0 +; GFX13-NEXT: ; return to shader part epilog 
entry: %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 %idxprom = zext i32 %idx to i64 @@ -301,12 +432,18 @@ entry: } define amdgpu_ps void @scratch_store_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: scratch_store_b32_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: scratch_store_b32 v0, v1, s0 scale_offset -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: scratch_store_b32_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX1250-NEXT: scratch_store_b32 v0, v1, s0 scale_offset +; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: scratch_store_b32_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX13-NEXT: scratch_store_b32 v0, v1, s0 scale_offset +; GFX13-NEXT: s_endpgm entry: %idxprom = zext i32 %idx to i64 %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom @@ -315,19 +452,31 @@ entry: } define amdgpu_ps void @scratch_store_b16_idxprom(ptr addrspace(5) align 2 inreg %p, i32 %idx) { -; GCN-FAKE16-LABEL: scratch_store_b16_idxprom: -; GCN-FAKE16: ; %bb.0: ; %entry -; GCN-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-FAKE16-NEXT: v_mov_b32_e32 v1, 1 -; GCN-FAKE16-NEXT: scratch_store_b16 v0, v1, s0 scale_offset -; GCN-FAKE16-NEXT: s_endpgm +; GFX1250-FAKE16-LABEL: scratch_store_b16_idxprom: +; GFX1250-FAKE16: ; %bb.0: ; %entry +; GFX1250-FAKE16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX1250-FAKE16-NEXT: scratch_store_b16 v0, v1, s0 scale_offset +; GFX1250-FAKE16-NEXT: s_endpgm +; +; GFX1250-REAL16-LABEL: scratch_store_b16_idxprom: +; GFX1250-REAL16: ; %bb.0: ; %entry 
+; GFX1250-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-REAL16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX1250-REAL16-NEXT: scratch_store_b16 v0, v1, s0 scale_offset +; GFX1250-REAL16-NEXT: s_endpgm ; -; GCN-REAL16-LABEL: scratch_store_b16_idxprom: -; GCN-REAL16: ; %bb.0: ; %entry -; GCN-REAL16-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-REAL16-NEXT: v_mov_b16_e32 v1.l, 1 -; GCN-REAL16-NEXT: scratch_store_b16 v0, v1, s0 scale_offset -; GCN-REAL16-NEXT: s_endpgm +; GFX13-FAKE16-LABEL: scratch_store_b16_idxprom: +; GFX13-FAKE16: ; %bb.0: ; %entry +; GFX13-FAKE16-NEXT: v_mov_b32_e32 v1, 1 +; GFX13-FAKE16-NEXT: scratch_store_b16 v0, v1, s0 scale_offset +; GFX13-FAKE16-NEXT: s_endpgm +; +; GFX13-REAL16-LABEL: scratch_store_b16_idxprom: +; GFX13-REAL16: ; %bb.0: ; %entry +; GFX13-REAL16-NEXT: v_mov_b16_e32 v1.l, 1 +; GFX13-REAL16-NEXT: scratch_store_b16 v0, v1, s0 scale_offset +; GFX13-REAL16-NEXT: s_endpgm entry: %idxprom = zext i32 %idx to i64 %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom @@ -336,12 +485,18 @@ entry: } define amdgpu_ps void @scratch_store_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { -; GCN-LABEL: scratch_store_b64_idxprom: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 -; GCN-NEXT: scratch_store_b64 v0, v[2:3], s0 scale_offset -; GCN-NEXT: s_endpgm +; GFX1250-LABEL: scratch_store_b64_idxprom: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 1.0 +; GFX1250-NEXT: scratch_store_b64 v0, v[2:3], s0 scale_offset +; GFX1250-NEXT: s_endpgm +; +; GFX13-LABEL: scratch_store_b64_idxprom: +; GFX13: ; %bb.0: ; %entry +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 
v2, 0x3ff00000 +; GFX13-NEXT: scratch_store_b64 v0, v[1:2], s0 scale_offset +; GFX13-NEXT: s_endpgm entry: %idxprom = zext i32 %idx to i64 %arrayidx = getelementptr inbounds double, ptr addrspace(5) %p, i64 %idxprom @@ -350,10 +505,3 @@ entry: } !0 = !{i32 0, i32 1024} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GISEL: {{.*}} -; GISEL-FAKE16: {{.*}} -; GISEL-REAL16: {{.*}} -; SDAG: {{.*}} -; SDAG-FAKE16: {{.*}} -; SDAG-REAL16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/sub_u64.ll b/llvm/test/CodeGen/AMDGPU/sub_u64.ll index 5336e81e87707..ffd0b5c793580 100644 --- a/llvm/test/CodeGen/AMDGPU/sub_u64.ll +++ b/llvm/test/CodeGen/AMDGPU/sub_u64.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GCN,GFX12 %s define amdgpu_ps <2 x float> @test_sub_u64_vv(i64 %a, i64 %b) { ; GFX12-LABEL: test_sub_u64_vv: diff --git a/llvm/test/MC/AMDGPU/gfx13_asm_valu_lit64.s b/llvm/test/MC/AMDGPU/gfx13_asm_valu_lit64.s new file mode 100644 index 0000000000000..b733830ce8d1d --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx13_asm_valu_lit64.s @@ -0,0 +1,259 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1310 %s | FileCheck --check-prefix=GFX13 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1310 %s | %extract-encodings | llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1310 --disassemble | FileCheck --check-prefix=GFX13 %s + +v_ceil_f64 v[254:255], 0x10abcdef12345678 +// GFX13: v_ceil_f64_e32 v[254:255], 0x10abcdef12345678 ; encoding: [0xfe,0x30,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + 
+v_cvt_f32_f64 v255, 0x10abcdef12345678 +// GFX13: v_cvt_f32_f64_e32 v255, 0x10abcdef12345678 ; encoding: [0xfe,0x1e,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cvt_i32_f64 v255, 0x10abcdef12345678 +// GFX13: v_cvt_i32_f64_e32 v255, 0x10abcdef12345678 ; encoding: [0xfe,0x06,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cvt_u32_f64 v255, 0x10abcdef12345678 +// GFX13: v_cvt_u32_f64_e32 v255, 0x10abcdef12345678 ; encoding: [0xfe,0x2a,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_floor_f64 v[254:255], 0x10abcdef12345678 +// GFX13: v_floor_f64_e32 v[254:255], 0x10abcdef12345678 ; encoding: [0xfe,0x34,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_fract_f64 v[254:255], 0x10abcdef12345678 +// GFX13: v_fract_f64_e32 v[254:255], 0x10abcdef12345678 ; encoding: [0xfe,0x7c,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_frexp_exp_i32_f64 v255, 0x10abcdef12345678 +// GFX13: v_frexp_exp_i32_f64_e32 v255, 0x10abcdef12345678 ; encoding: [0xfe,0x78,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_frexp_mant_f64 v[254:255], 0x10abcdef12345678 +// GFX13: v_frexp_mant_f64_e32 v[254:255], 0x10abcdef12345678 ; encoding: [0xfe,0x7a,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_rcp_f64 v[254:255], 0x10abcdef12345678 +// GFX13: v_rcp_f64_e32 v[254:255], 0x10abcdef12345678 ; encoding: [0xfe,0x5e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_rndne_f64 v[254:255], 0x10abcdef12345678 +// GFX13: v_rndne_f64_e32 v[254:255], 0x10abcdef12345678 ; encoding: [0xfe,0x32,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_rsq_f64 v[254:255], 0x10abcdef12345678 +// GFX13: v_rsq_f64_e32 v[254:255], 0x10abcdef12345678 ; encoding: [0xfe,0x62,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_sqrt_f64 v[254:255], 0x10abcdef12345678 +// GFX13: v_sqrt_f64_e32 v[254:255], 0x10abcdef12345678 ; encoding: [0xfe,0x68,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_trunc_f64 v[254:255], 0x10abcdef12345678 +// GFX13: 
v_trunc_f64_e32 v[254:255], 0x10abcdef12345678 ; encoding: [0xfe,0x2e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_add_f64 v[254:255], 0x10abcdef12345678, v[254:255] +// GFX13: v_add_f64_e32 v[254:255], 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x05,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_max_num_f64 v[254:255], 0x10abcdef12345678, v[254:255] +// GFX13: v_max_num_f64_e32 v[254:255], 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_min_num_f64 v[254:255], 0x10abcdef12345678, v[254:255] +// GFX13: v_min_num_f64_e32 v[254:255], 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_mul_f64 v[254:255], 0x10abcdef12345678, v[254:255] +// GFX13: v_mul_f64_e32 v[254:255], 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x0d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_class_f64 vcc_lo, 0x10abcdef12345678, v255 +// GFX13: v_cmp_class_f64_e32 vcc_lo, 0x10abcdef12345678, v255 ; encoding: [0xfe,0xfe,0xff,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_eq_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_eq_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_ge_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_ge_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_gt_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_gt_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_gt_i64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_gt_i64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_gt_u64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_gt_u64_e32 vcc_lo, 
0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_le_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_le_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_le_i64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_le_i64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_le_u64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_le_u64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_lg_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_lg_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_lt_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_lt_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_lt_i64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_lt_i64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_lt_u64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_lt_u64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_ne_i64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_ne_i64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_ne_u64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_ne_u64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_neq_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_neq_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; 
encoding: [0xfe,0xfc,0x5b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_nge_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_nge_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_ngt_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_ngt_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_nle_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_nle_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_nlg_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_nlg_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_nlt_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_nlt_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_o_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_o_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmp_u_f64 vcc_lo, 0x10abcdef12345678, v[254:255] +// GFX13: v_cmp_u_f64_e32 vcc_lo, 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_class_f64 0x10abcdef12345678, v255 +// GFX13: v_cmpx_class_f64_e32 0x10abcdef12345678, v255 ; encoding: [0xfe,0xfe,0xff,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_eq_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_eq_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_eq_i64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_eq_i64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xa5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + 
+v_cmpx_eq_u64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_eq_u64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xb5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_ge_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_ge_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_ge_i64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_ge_i64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xad,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_ge_u64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_ge_u64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xbd,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_gt_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_gt_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_gt_i64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_gt_i64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_gt_u64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_gt_u64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_le_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_le_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_le_i64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_le_i64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_le_u64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_le_u64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_lg_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_lg_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 
+ +v_cmpx_lt_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_lt_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_lt_i64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_lt_i64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_lt_u64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_lt_u64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_ne_i64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_ne_i64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_ne_u64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_ne_u64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_neq_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_neq_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_nge_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_nge_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_ngt_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_ngt_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_nle_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_nle_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_nlg_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_nlg_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_nlt_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_nlt_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: 
[0xfe,0xfc,0x5d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_o_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_o_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_cmpx_u_f64 0x10abcdef12345678, v[254:255] +// GFX13: v_cmpx_u_f64_e32 0x10abcdef12345678, v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] + +v_ceil_f64 v[254:255], 153.1 +// GFX13: v_ceil_f64_e32 v[254:255], 0x4063233333333333 ; encoding: [0xfe,0x30,0xfc,0x7f,0x33,0x33,0x33,0x33,0x33,0x23,0x63,0x40] + +v_ceil_f64 v[254:255], 1.5e22 +// GFX13: v_ceil_f64_e32 v[254:255], 0x448969368974c05b ; encoding: [0xfe,0x30,0xfc,0x7f,0x5b,0xc0,0x74,0x89,0x36,0x69,0x89,0x44] + +// These 64-bit literals can be represented as 32-bit with encoding 255. HW behavior: +// 64-bit float: the lower 32 bits are padded with zeros +// 64-bit unsigned integer: zero-extended to 64 bits +// 64-bit signed integer: sign-extended to 64 bits + +v_ceil_f64 v[254:255], 153.0 +// GFX13: v_ceil_f64_e32 v[254:255], 0x40632000 ; encoding: [0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40] + +v_ceil_f64 v[254:255], 0x40632000 +// GFX13: v_ceil_f64_e32 v[254:255], 0x40632000 ; encoding: [0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40] + +v_ceil_f64 v[254:255], 0x4063200000000000 +// GFX13: v_ceil_f64_e32 v[254:255], 0x40632000 ; encoding: [0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40] + +// Check inlinable literals: + +// 1.0 / (2.0 * pi) +v_ceil_f64 v[254:255], 0x3fc45f306dc9c882 +// GFX13: v_ceil_f64_e32 v[254:255], 0.15915494309189532 ; encoding: [0xf8,0x30,0xfc,0x7f] + +v_ceil_f64 v[254:255], 0.15915494309189532 +// GFX13: v_ceil_f64_e32 v[254:255], 0.15915494309189532 ; encoding: [0xf8,0x30,0xfc,0x7f] + +v_ceil_f64 v[254:255], -4.0 +// GFX13: v_ceil_f64_e32 v[254:255], -4.0 ; encoding: [0xf7,0x30,0xfc,0x7f] + +v_ceil_f64 v[254:255], 2.0 +// GFX13: v_ceil_f64_e32 v[254:255], 2.0 ; encoding: [0xf4,0x30,0xfc,0x7f] + +v_ceil_f64 v[254:255], 0.0 +//
GFX13: v_ceil_f64_e32 v[254:255], 0 ; encoding: [0x80,0x30,0xfc,0x7f] + +v_ceil_f64 v[254:255], 0x0 +// GFX13: v_ceil_f64_e32 v[254:255], 0 ; encoding: [0x80,0x30,0xfc,0x7f] + +// Enforce a 64-bit literal even if it fits in the low 32 bits (a very small double number). +// Because the syntax stays backward compatible with short hex strings that represent the +// high 32 bits only, this is the only way to encode a small number as hex. +// Make sure lit64() is used when printing to disambiguate it from a short hex string. + +v_ceil_f64 v[254:255], lit64(0x7b) +// GFX13: v_ceil_f64_e32 v[254:255], lit64(0x7b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00] + +v_ceil_f64 v[254:255], lit64(123) +// GFX13: v_ceil_f64_e32 v[254:255], lit64(0x7b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00] + +v_ceil_f64 v[254:255], 2.1e-320 +// GFX13: v_ceil_f64_e32 v[254:255], lit64(0x109a) ; encoding: [0xfe,0x30,0xfc,0x7f,0x9a,0x10,0x00,0x00,0x00,0x00,0x00,0x00]