[llvm] [AMDGPU] Select v_lshl_add_u32 instead of v_mul_lo_u32 by constant (PR #71035)

via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 2 01:05:26 PDT 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: None (sstipanovic)

<details>
<summary>Changes</summary>

Instead of: v_mul_lo_u32 v0, v0, 5 we should generate: v_lshl_add_u32 v0, v0, 2, v0. Expand mul to shl + add which is then matched to v_lshl_add_u32.

---

Patch is 90.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/71035.diff


13 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+12) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+36-24) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+145-87) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+80-57) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (+87-40) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+36-24) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+36-24) 
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+89) 
- (modified) llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll (+96-50) 
- (modified) llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll (+9-6) 
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll (+7-7) 
- (modified) llvm/test/CodeGen/AMDGPU/wqm.ll (+2-2) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 607d59db7bcf709..fe72628090b6d6c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4080,6 +4080,18 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // FIXME: Probably should only be done for gfx9 and onward.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+    APInt CVal = C->getAPIntValue();
+    if (isPowerOf2_32(CVal.getZExtValue() - 1)) {
+      unsigned int Sqr = Log2_32(CVal.getZExtValue() - 1);
+      SDValue Pow2Constant =
+          DAG.getConstant(Sqr, DL, N1.getValueType());
+      SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, N0, Pow2Constant);
+      return DAG.getNode(ISD::ADD, DL, VT, Shl, N0);
+    }
+  }
+
   // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
   // matching.
 
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index f8f50c7cb23a5aa..dda40a77aebda48 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX6-NEXT:    s_endpgm
@@ -64,7 +67,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -92,8 +97,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -120,10 +126,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10W64-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10W64-NEXT:    s_endpgm
 ;
 ; GFX10W32-LABEL: add_i32_constant:
@@ -147,10 +154,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10W32-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10W32-NEXT:    s_endpgm
 ;
 ; GFX11W64-LABEL: add_i32_constant:
@@ -176,11 +184,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11W64-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11W64-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11W64-NEXT:    s_nop 0
 ; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W64-NEXT:    s_endpgm
@@ -207,11 +216,11 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11W32-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT:    global_store_b32 v2, v0, s[0:1]
 ; GFX11W32-NEXT:    s_nop 0
 ; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11W32-NEXT:    s_endpgm
@@ -1109,7 +1118,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX6-NEXT:    s_mov_b32 s2, -1
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
@@ -1136,7 +1147,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s2, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -1165,7 +1177,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1194,7 +1206,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W64-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1222,7 +1234,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX10W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX10W32-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1252,7 +1264,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W64-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W64-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
@@ -1284,7 +1296,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
 ; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11W32-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX11W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX11W32-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 81fd166e3779f83..78395020e99fa34 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -41,42 +41,81 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
 ; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
-; GFX89-LABEL: add_i32_constant:
-; GFX89:       ; %bb.0: ; %entry
-; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT:    s_mov_b64 s[6:7], exec
-; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
-; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX89-NEXT:    ; implicit-def: $vgpr1
-; GFX89-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX89-NEXT:    s_cbranch_execz .LBB0_2
-; GFX89-NEXT:  ; %bb.1:
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s8, s2
-; GFX89-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
-; GFX89-NEXT:    s_mul_i32 s2, s2, 5
-; GFX89-NEXT:    s_mov_b32 s11, 0xf000
-; GFX89-NEXT:    s_mov_b32 s10, -1
-; GFX89-NEXT:    s_mov_b32 s9, s3
-; GFX89-NEXT:    v_mov_b32_e32 v1, s2
-; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX89-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
-; GFX89-NEXT:    s_waitcnt vmcnt(0)
-; GFX89-NEXT:    buffer_wbinvl1_vol
-; GFX89-NEXT:  .LBB0_2:
-; GFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX89-NEXT:    s_mov_b32 s3, 0xf000
-; GFX89-NEXT:    s_mov_b32 s2, -1
-; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GFX89-NEXT:    s_endpgm
+; GFX8-LABEL: add_i32_constant:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    s_mov_b64 s[6:7], exec
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX8-NEXT:    ; implicit-def: $vgpr1
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT:    s_cbranch_execz .LBB0_2
+; GFX8-NEXT:  ; %bb.1:
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s8, s2
+; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT:    s_mul_i32 s2, s2, 5
+; GFX8-NEXT:    s_mov_b32 s11, 0xf000
+; GFX8-NEXT:    s_mov_b32 s10, -1
+; GFX8-NEXT:    s_mov_b32 s9, s3
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:  .LBB0_2:
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: add_i32_constant:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    s_mov_b64 s[6:7], exec
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT:    ; implicit-def: $vgpr1
+; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT:    s_cbranch_execz .LBB0_2
+; GFX9-NEXT:  ; %bb.1:
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s8, s2
+; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT:    s_mul_i32 s2, s2, 5
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
+; GFX9-NEXT:    s_mov_b32 s9, s3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:  .LBB0_2:
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_add3_u32 v0, v1, v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
@@ -108,8 +147,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1064-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
 ; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
@@ -143,8 +183,9 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1032-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1032-NEXT:    s_endpgm
@@ -179,9 +220,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
+; GFX1164-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1164-NEXT:    s_mov_b32 s2, -1
 ; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_nop 0
@@ -217,9 +259,10 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
+; GFX1132-NEXT:    v_add3_u32 v0, v1, v0, s2
 ; GFX1132-NEXT:    s_mov_b32 s2, -1
 ; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1132-NEXT:    s_nop 0
@@ -858,13 +901,14 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s7, v1
 ; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
-; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
-; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
-; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v0, 2, v2
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX7LESS-NEXT:    v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
+; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s7
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s6, v0
 ; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
 ; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -1510,7 +1554,9 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX7LESS-NEXT:    s_mov_b32 s2, -1
 ; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
+; GFX7LESS-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
@@ -1541,7 +1587,8 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:  .LBB6_2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v1, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s2, -1
@@ -1575,7 +1622,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:  .LBB6_2:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s3, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s2, -1
@@ -1613,7 +1660,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1064-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1064-NEXT:    s_mov_b32 s2, -1
@@ -1649,7 +1696,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1032-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
 ; GFX1032-NEXT:    s_mov_b32 s2, -1
@@ -1686,7 +1733,7 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX1164-NEXT:    v_lshl_add_u32 v0, v0, 2, v0
 ; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/71035


More information about the llvm-commits mailing list