[llvm] [AMDGPU] Pre-commit tests for "lshr + mad" fold (PR #119509)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 17 06:38:56 PST 2024
================
@@ -1333,5 +1333,625 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 {
ret i48 %a
}
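+; The high 32 bits of the constant are all ones (a sign-extended 32-bit
+; value), so this lshr-by-32 + mul + add chain is a candidate for the
+; "lshr + mad" fold this PR pre-commits tests for.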
+define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 {
+; CI-LABEL: lshr_mad_i64_1:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_movk_i32 s4, 0xfc19
+; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT: v_mov_b32_e32 v0, v2
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_1:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_movk_i32 s4, 0xfc19
+; SI-NEXT: v_mul_hi_u32 v2, v1, s4
+; SI-NEXT: v_mul_lo_u32 v3, v1, s4
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0xfc19
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xfffffc19, v1, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 %lsh, s0xfffffffffffffc19
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
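+; As above, but with the constant as the first multiply operand.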
+define i64 @lshr_mad_i64_2(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_2:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_movk_i32 s4, 0xfc88
+; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v1
+; CI-NEXT: v_mov_b32_e32 v0, v2
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_2:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_movk_i32 s4, 0xfc88
+; SI-NEXT: v_mul_hi_u32 v2, v1, s4
+; SI-NEXT: v_mul_lo_u32 v3, v1, s4
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v1
+; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0xfc88
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v1, v3, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xfffffc88, v1, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v1
+; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 s0xfffffffffffffc88, %lsh
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
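+; As above, but the shifted value is itself a 64-bit multiply result
+; rather than a plain argument.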
+define i64 @lshr_mad_i64_3(i32 %arg0, i64 %arg1) #0 {
+; CI-LABEL: lshr_mad_i64_3:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_mul_lo_u32 v3, v2, v0
+; CI-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v1, v0, 0
+; CI-NEXT: s_movk_i32 s4, 0xfc88
+; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[1:2]
+; CI-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_3:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mul_lo_u32 v2, v2, v0
+; SI-NEXT: v_mul_hi_u32 v3, v1, v0
+; SI-NEXT: s_movk_i32 s4, 0xfc88
+; SI-NEXT: v_mul_lo_u32 v0, v1, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; SI-NEXT: v_mul_hi_u32 v3, v2, s4
+; SI-NEXT: v_mul_lo_u32 v1, v2, s4
+; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v2
+; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v0, 0
+; GFX9-NEXT: v_mov_b32_e32 v6, v5
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v0, v[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-NEXT: s_movk_i32 s4, 0xfc88
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, s4, v[4:5]
+; GFX9-NEXT: v_sub_u32_e32 v1, v1, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v2, v0, v[1:2]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v1, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_3:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v2, v0, v[1:2]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v5, v[3:4]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %ext = zext i32 %arg0 to i64
+ %mul1 = mul i64 %arg1, %ext
+ %lsh = lshr i64 %mul1, 32
+ %mul2 = mul i64 %lsh, s0xfffffffffffffc88
+ %mad = add i64 %mul2, %mul1
+ ret i64 %mad
+}
+
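+; Negative test: the shift amount is 36, not 32.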
+define i64 @lshr_mad_i64_negative_1(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_1:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_lshrrev_b32_e32 v2, 4, v1
+; CI-NEXT: s_movk_i32 s4, 0xfc19
+; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_1:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshrrev_b32_e32 v2, 4, v1
+; SI-NEXT: s_movk_i32 s4, 0xfc19
+; SI-NEXT: v_mul_lo_u32 v3, v2, s4
+; SI-NEXT: v_mul_hi_i32 v2, v2, s4
+; SI-NEXT: v_add_i32_e32 v0, vcc, v3, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_1:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 4, v1
+; GFX9-NEXT: s_movk_i32 s4, 0xfc19
+; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v2, s4, v[0:1]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64_negative_1:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_lshrrev_b32_e32 v4, 4, v1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_mad_i64_i32 v[2:3], null, 0xfffffc19, v4, v[0:1]
+; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64_negative_1:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 4, v1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_mad_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
+; GFX1150-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_1:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 4, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1]
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr i64 %arg0, 36
+ %mul = mul i64 %lsh, s0xfffffffffffffc19
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
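+; Negative test: the high 32 bits of the constant are 0xffffff00, not all ones.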
+define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_2:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: s_movk_i32 s4, 0xd1
+; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; CI-NEXT: v_lshlrev_b32_e32 v0, 8, v1
+; CI-NEXT: v_sub_i32_e32 v1, vcc, v3, v0
+; CI-NEXT: v_mov_b32_e32 v0, v2
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_2:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_movk_i32 s4, 0xd1
+; SI-NEXT: v_mul_hi_u32 v2, v1, s4
+; SI-NEXT: v_mul_lo_u32 v4, v1, s4
+; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v1
+; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
+; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_2:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_movk_i32 s4, 0xd1
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, s4, v[0:1]
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1
+; GFX9-NEXT: v_sub_u32_e32 v1, v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_negative_2:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 8, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_2:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0xd1, v1, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 8, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 %lsh, s0xffffff00000000d1
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
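+; Negative test: the add operand is %arg0 + 1, not the value that was shifted.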
+define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_3:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_lshr_b64 v[2:3], v[0:1], 22
+; CI-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
+; CI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; CI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_3:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 22
+; SI-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
+; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_3:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX9-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: lshr_mad_i64_negative_3:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_3:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2
+; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
+; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %op = add i64 %arg0, 1
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 %lsh, s0xfffffffffffffc00
+ %mad = add i64 %mul, %op
+
+ ret i64 %mad
+}
+
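+; Negative test: the multiplier is a variable rather than a constant.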
+define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 {
+; CI-LABEL: lshr_mad_i64_negative_4:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
+; CI-NEXT: v_mul_lo_u32 v0, v1, v1
+; CI-NEXT: v_add_i32_e32 v1, vcc, v0, v3
+; CI-NEXT: v_mov_b32_e32 v0, v2
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: lshr_mad_i64_negative_4:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_mul_hi_u32 v2, v1, v0
+; SI-NEXT: v_mul_lo_u32 v3, v1, v1
+; SI-NEXT: v_mul_lo_u32 v4, v1, v0
+; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; SI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; SI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lshr_mad_i64_negative_4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v1, v0, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v1, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1100-LABEL: lshr_mad_i64_negative_4:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1]
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GFX1100-NEXT: v_mad_u64_u32 v[3:4], null, v1, v1, v[0:1]
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1150-LABEL: lshr_mad_i64_negative_4:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1150-NEXT: v_mad_u64_u32 v[3:4], null, v1, v0, v[0:1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_mov_b32_e32 v0, v4
+; GFX1150-NEXT: v_mad_u64_u32 v[1:2], null, v1, v1, v[0:1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1150-NEXT: v_mov_b32_e32 v0, v3
+; GFX1150-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lshr_mad_i64_negative_4:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v1, v0, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, v4
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v1, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %lsh = lshr i64 %arg0, 32
+ %mul = mul i64 %lsh, %arg0
+ %mad = add i64 %mul, %arg0
+
+ ret i64 %mad
+}
+
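+; Same pattern with the input held in SGPRs (inreg argument).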
+define i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 {
----------------
arsenm wrote:
I'm not saying to change the type of the operation, only the ABI type. If i64 gives you an SGPR result, then use it. If not, you can make the return type v2i32 and bitcast.
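
For illustration, a sketch of that second option, assuming the test body mirrors @lshr_mad_i64_1 above (the function name, constant, and body here are illustrative, not the actual test):

define <2 x i32> @lshr_mad_i64_sgpr_v2i32(i64 inreg %arg0) #0 {
  %lsh = lshr i64 %arg0, 32
  %mul = mul i64 %lsh, s0xfffffffffffffc19
  %mad = add i64 %mul, %arg0
  ; Only the return (ABI) type changes, so the result can stay in SGPRs;
  ; the i64 operations themselves are untouched.
  %cast = bitcast i64 %mad to <2 x i32>
  ret <2 x i32> %cast
}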
https://github.com/llvm/llvm-project/pull/119509