[llvm] [LSV] Prepare atomicrmw operands for vectorization (PR #114702)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 3 02:48:31 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Anshil Gandhi (gandhi56)
<details>
<summary>Changes</summary>
Inserts the necessary bitcast and inttoptr instructions
to support vectorization of atomicrmw operands.
---
Patch is 45.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114702.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp (+91)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+238-306)
- (added) llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/fp-atomics.ll (+50)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 02ec1d5c259cd6..8767141c5d764c 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -86,6 +86,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -338,6 +339,10 @@ class Vectorizer {
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
/// in the chain is the leader, and an instr touches distance 0 from itself.
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+ /// Attempts to prepare atomic read-modify-write instructions for
+ /// vectorization.
+ bool prepareAtomicInstOps(AtomicRMWInst *);
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -419,8 +424,94 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
return Changed ? PA : PreservedAnalyses::all();
}
+/**
+ * @brief Prepare operands of atomicrmw instructions for vectorization.
+ *
+ * Ensures the given AtomicRMWInst's pointer and value operands meet type
+ * requirements and are load instructions. Inserts necessary bitcast and
+ * inttoptr instructions.
+ *
+ * @param AI Pointer to the AtomicRMWInst in question.
+ * @return true if the operands were successfully prepared, false otherwise.
+ */
+bool Vectorizer::prepareAtomicInstOps(AtomicRMWInst *AI) {
+ if (AI->isVolatile() || AI->hasMetadata("amdgpu.no.fine.grained.memory"))
+ return false;
+
+ auto *Ptr = AI->getPointerOperand();
+ auto *PtrTy = Ptr->getType();
+ auto *Val = AI->getValOperand();
+ auto *ValTy = Val->getType();
+
+ if (!PtrTy->isPointerTy())
+ return false;
+
+ // Only cast if the value operand type is
+ // <2 x half>, <2 x i16>, <4 x i8>, f32, or i32
+ bool ValTyIsOkay = false;
+ if (auto *VTy = dyn_cast<FixedVectorType>(ValTy)) {
+ if (VTy->getNumElements() == 2) {
+ if (VTy->getElementType()->isHalfTy())
+ ValTyIsOkay = true;
+ if (VTy->getElementType()->isIntegerTy(16))
+ ValTyIsOkay = true;
+ } else if (VTy->getNumElements() == 4 &&
+ VTy->getElementType()->isIntegerTy(8)) {
+ ValTyIsOkay = true;
+ }
+ } else {
+ if (ValTy->isFloatTy())
+ ValTyIsOkay = true;
+ if (ValTy->isIntegerTy(32))
+ ValTyIsOkay = true;
+ }
+ if (!ValTyIsOkay)
+ return false;
+
+ // Walk up the chain of instructions to find the load instruction
+ auto GetLoadInst = [](Value *Ptr) -> LoadInst * {
+ while (Ptr) {
+ if (!isa<Instruction>(Ptr))
+ return nullptr;
+ if (auto *LI = dyn_cast<LoadInst>(Ptr))
+ return LI;
+ if (isa<GetElementPtrInst>(Ptr))
+ return nullptr;
+ if (auto *PtrInst = dyn_cast<Instruction>(Ptr))
+ Ptr = PtrInst->getOperand(0);
+ else
+ return nullptr;
+ }
+ return nullptr;
+ };
+
+ // Pointer and value operands must be load instructions to be vectorized
+ auto *ValLoadInst = GetLoadInst(Val);
+ auto *PtrLoadInst = GetLoadInst(Ptr);
+ if (!ValLoadInst || !PtrLoadInst)
+ return false;
+
+ // Insert bitcast to replace atomicrmw value operand
+ IRBuilder<> Builder(AI->getParent(), AI->getIterator());
+ ValLoadInst->mutateType(IntegerType::getInt32Ty(AI->getContext()));
+ AI->setOperand(1, Builder.CreateBitCast(ValLoadInst, ValTy,
+ ValLoadInst->getName() + ".bitcast"));
+
+ // Insert inttoptr to replace atomicrmw pointer operand
+ PtrLoadInst->mutateType(IntegerType::getInt32Ty(AI->getContext()));
+ AI->setOperand(0,
+ Builder.CreateIntToPtr(PtrLoadInst, PtrTy,
+ PtrLoadInst->getName() + ".inttoptr"));
+ return true;
+}
+
bool Vectorizer::run() {
bool Changed = false;
+
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+ if (auto *AI = dyn_cast<AtomicRMWInst>(&*I))
+ Changed |= prepareAtomicInstOps(AI);
+
// Break up the BB if there are any instrs which aren't guaranteed to transfer
// execution to their successor.
//
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 495bfec5454ee8..4c6d0559593820 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -337,245 +337,222 @@ entry:
define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) {
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec
-; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
-; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0
+; GFX7LESS-NEXT: s_load_dword s4, s[2:3], 0xd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2
; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dword s8, s[2:3], 0xb
+; GFX7LESS-NEXT: s_mov_b32 s9, 0
; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3
; GFX7LESS-NEXT: s_mov_b32 s10, -1
-; GFX7LESS-NEXT: s_mov_b32 s8, s6
-; GFX7LESS-NEXT: s_mov_b32 s9, s7
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4
; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: .LBB1_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1
-; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34
-; GFX8-NEXT: s_mov_b64 s[0:1], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX8-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB1_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX8-NEXT: s_load_dword s8, s[2:3], 0x2c
+; GFX8-NEXT: s_mov_b32 s9, 0
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s0, s8, s0
-; GFX8-NEXT: s_mov_b32 s15, 0xf000
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s12, s6
-; GFX8-NEXT: s_mov_b32 s13, s7
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB1_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34
-; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT: s_load_dword s8, s[2:3], 0x2c
+; GFX9-NEXT: s_mov_b32 s9, 0
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s0, s8, s0
-; GFX9-NEXT: s_mov_b32 s15, 0xf000
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s12, s6
-; GFX9-NEXT: s_mov_b32 s13, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB1_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34
-; GFX1064-NEXT: s_mov_b64 s[8:9], exec
+; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB1_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
-; GFX1064-NEXT: s_mov_b32 s15, 0x31016000
+; GFX1064-NEXT: s_load_dword s8, s[2:3], 0x2c
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s2, s10, s2
-; GFX1064-NEXT: s_mov_b32 s14, -1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
-; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: s_mov_b32 s9, 0
+; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s6, -1
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v0, s[0:1]
-; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34
-; GFX1032-NEXT: s_mov_b32 s8, exec_lo
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB1_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8
-; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s2, s0, s2
-; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s8, s6
-; GFX1032-NEXT: s_mov_b32 s9, s7
-; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: s_mov_b32 s5, 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34
-; GFX1164-NEXT: s_mov_b64 s[8:9], exec
+; GFX1164-NEXT: s_load_b32 s4, s[2:3], 0x34
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT: s_load_b32 s8, s[2:3], 0x2c
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s3, s2, s3
+; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: s_mov_b32 s9, 0
+; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s3
-; GFX1164-NEXT: s_mov_b32 s8, s6
-; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB1_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
-; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[2:3]
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34
-; GFX1132-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8
-; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s2, s0, s2
-; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-NEXT: s_mov_b32 s8, s6
-; GFX1132-NEXT: s_mov_b32 s9, s7
-; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
+; GFX1132-NEXT: s_mov_b32 s5, 0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB1_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
@@ -583,72 +560,62 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1264-LABEL: add_i32_uniform:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_clause 0x1
-; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34
-; GFX1264-NEXT: s_mov_b64 s[8:9], exec
+; GFX1264-NEXT: s_load_b32 s4, s[2:3], 0x34
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1264-NEXT: s_mov_b64 s[0:1], exec
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1264-NEXT: s_cbranch_execz .LBB1_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1264-NEXT: ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/114702
More information about the llvm-commits
mailing list