[llvm] [AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by constant (PR #71035)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 2 01:05:26 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (sstipanovic)
<details>
<summary>Changes</summary>
Instead of `v_mul_lo_u32 v0, v0, 5`, we should generate `v_lshl_add_u32 v0, v0, 2, v0`. When multiplying by a constant of the form 2^n + 1, expand the mul into shl + add, which is then matched to v_lshl_add_u32.
---
Patch is 90.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/71035.diff
13 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+12)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+36-24)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+145-87)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+80-57)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (+87-40)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+36-24)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+36-24)
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+89)
- (modified) llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll (+96-50)
- (modified) llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll (+9-6)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/wqm.ll (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 607d59db7bcf709..fe72628090b6d6c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4080,6 +4080,18 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ // FIXME: Probably should only be done for gfx9 and onward.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+ APInt CVal = C->getAPIntValue();
+ if (isPowerOf2_32(CVal.getZExtValue() - 1)) {
+ unsigned int Sqr = Log2_32(CVal.getZExtValue() - 1);
+ SDValue Pow2Constant =
+ DAG.getConstant(Sqr, DL, N1.getValueType());
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, N0, Pow2Constant);
+ return DAG.getNode(ISD::ADD, DL, VT, Shl, N0);
+ }
+ }
+
// Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
// matching.
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index f8f50c7cb23a5aa..dda40a77aebda48 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
-; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -64,7 +67,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -92,8 +97,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -120,10 +126,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX10W64-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -147,10 +154,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W32-NEXT: v_mov_b32_e32 v2, 0
+; GFX10W32-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -176,11 +184,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX11W64-NEXT: v_mov_b32_e32 v2, 0
+; GFX11W64-NEXT: s_waitcnt_depctr 0xfff
+; GFX11W64-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -207,11 +216,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11W32-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v2, v0, s[0:1]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1109,7 +1118,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
-; GFX6-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1136,7 +1147,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1165,7 +1177,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
-; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -1194,7 +1206,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W64-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
@@ -1222,7 +1234,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W32-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
@@ -1252,7 +1264,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W64-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
@@ -1284,7 +1296,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W32-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 81fd166e3779f83..78395020e99fa34 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -41,42 +41,81 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
-; GFX89-LABEL: add_i32_constant:
-; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b64 s[6:7], exec
-; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX89-NEXT: ; implicit-def: $vgpr1
-; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX89-NEXT: s_cbranch_execz .LBB0_2
-; GFX89-NEXT: ; %bb.1:
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
-; GFX89-NEXT: s_mul_i32 s2, s2, 5
-; GFX89-NEXT: s_mov_b32 s11, 0xf000
-; GFX89-NEXT: s_mov_b32 s10, -1
-; GFX89-NEXT: s_mov_b32 s9, s3
-; GFX89-NEXT: v_mov_b32_e32 v1, s2
-; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: .LBB0_2:
-; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX89-NEXT: v_readfirstlane_b32 s4, v1
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
-; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX89-NEXT: s_endpgm
+; GFX8-LABEL: add_i32_constant:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT: ; implicit-def: $vgpr1
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_cbranch_execz .LBB0_2
+; GFX8-NEXT: ; %bb.1:
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s2
+; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT: s_mul_i32 s2, s2, 5
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: .LBB0_2:
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: add_i32_constant:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: ; implicit-def: $vgpr1
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB0_2
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s2
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_mul_i32 s2, s2, 5
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: .LBB0_2:
+; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_add3_u32 v0, v1, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
@@ -108,8 +147,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1064-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
@@ -143,8 +183,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1032-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
@@ -179,9 +220,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1164-NEXT: s_waitcnt_depctr 0xfff
+; GFX1164-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_nop 0
@@ -217,9 +259,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1132-NEXT: v_add3_u32 v0, v1, v0, s2
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_nop 0
@@ -858,13 +901,14 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s6, v0
+; GFX7LESS-NEXT: v_readfirstlane_b32 s7, v1
; GFX7LESS-NEXT: s_waitcnt expcnt(0)
-; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v0, 2, v2
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7LESS-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s7
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s6, v0
; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -1510,7 +1554,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -1541,7 +1587,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: .LBB6_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
@@ -1575,7 +1622,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: .LBB6_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@@ -1613,7 +1660,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX1064-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
@@ -1649,7 +1696,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX1032-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
@@ -1686,7 +1733,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
+; GFX1164-NEXT: v_lshl_add_u32 v0, v0, 2, v0
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/71035
More information about the llvm-commits mailing list