[llvm] [LSV] Prepare atomicrmw operands for vectorization (PR #114702)

Sun Nov 3 02:47:59 PST 2024

https://github.com/gandhi56 created https://github.com/llvm/llvm-project/pull/114702

Insert necessary casts and inttoptr instructions
to support vectorization of atomicrmw operands.

Change-Id: I8c3b7840ca5029b3c840359c44654ebec133fd2c


>From fd0fb2c10a504f4e1b41f5ecf1c7ba962de49f64 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <Anshil.Gandhi at amd.com>
Date: Mon, 28 Oct 2024 02:48:30 +0000
Subject: [PATCH] [LSV] Prepare atomicrmw operands for vectorization

Insert necessary casts and inttoptr instructions
to support vectorization of atomicrmw operands.

Change-Id: I8c3b7840ca5029b3c840359c44654ebec133fd2c
---
 .../Vectorize/LoadStoreVectorizer.cpp         |  91 +++
 .../atomic_optimizations_global_pointer.ll    | 544 ++++++++----------
 .../LoadStoreVectorizer/AMDGPU/fp-atomics.ll  |  50 ++
 3 files changed, 379 insertions(+), 306 deletions(-)
 create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/fp-atomics.ll

diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 02ec1d5c259cd6..8767141c5d764c 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -86,6 +86,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -338,6 +339,10 @@ class Vectorizer {
   /// Postcondition: For all i, ret[i][0].second == 0, because the first instr
   /// in the chain is the leader, and an instr touches distance 0 from itself.
   std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+  /// Attempts to prepare atomic read-modify-write instructions for
+  /// vectorization.
+  bool prepareAtomicInstOps(AtomicRMWInst *);
 };
 
 class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -419,8 +424,94 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
   return Changed ? PA : PreservedAnalyses::all();
 }
 
+/**
+ * @brief Prepare operands of atomicrmw instructions for vectorization.
+ *
+ * Ensures the given AtomicRMWInst's pointer and value operands meet type
+ * requirements and are load instructions. Inserts necessary bitcast and
+ * inttoptr instructions.
+ *
+ * @param AI Pointer to the AtomicRMWInst in question.
+ * @return true if the operands were successfully prepared, false otherwise.
+ */
+bool Vectorizer::prepareAtomicInstOps(AtomicRMWInst *AI) {
+  if (AI->isVolatile() || AI->hasMetadata("amdgpu.no.fine.grained.memory"))
+    return false;
+
+  auto *Ptr = AI->getPointerOperand();
+  auto *PtrTy = Ptr->getType();
+  auto *Val = AI->getValOperand();
+  auto *ValTy = Val->getType();
+
+  if (!PtrTy->isPointerTy())
+    return false;
+
+  // Only cast if the value operand type is
+  // <2 x half>, <2 x i16>, <4 x i8>, f32, or i32
+  bool ValTyIsOkay = false;
+  if (auto *VTy = dyn_cast<FixedVectorType>(ValTy)) {
+    if (VTy->getNumElements() == 2) {
+      if (VTy->getElementType()->isHalfTy())
+        ValTyIsOkay = true;
+      if (VTy->getElementType()->isIntegerTy(16))
+        ValTyIsOkay = true;
+    } else if (VTy->getNumElements() == 4 &&
+               VTy->getElementType()->isIntegerTy(8)) {
+      ValTyIsOkay = true;
+    }
+  } else {
+    if (ValTy->isFloatTy())
+      ValTyIsOkay = true;
+    if (ValTy->isIntegerTy(32))
+      ValTyIsOkay = true;
+  }
+  if (!ValTyIsOkay)
+    return false;
+
+  // Walk up the chain of instructions to find the load instruction
+  auto GetLoadInst = [](Value *Ptr) -> LoadInst * {
+    while (Ptr) {
+      if (!isa<Instruction>(Ptr))
+        return nullptr;
+      if (auto *LI = dyn_cast<LoadInst>(Ptr))
+        return LI;
+      if (isa<GetElementPtrInst>(Ptr))
+        return nullptr;
+      if (auto *PtrInst = dyn_cast<Instruction>(Ptr))
+        Ptr = PtrInst->getOperand(0);
+      else
+        return nullptr;
+    }
+    return nullptr;
+  };
+
+  // Pointer and value operands must be load instructions to be vectorized
+  auto *ValLoadInst = GetLoadInst(Val);
+  auto *PtrLoadInst = GetLoadInst(Ptr);
+  if (!ValLoadInst || !PtrLoadInst)
+    return false;
+
+  // Insert bitcast to replace atomicrmw value operand
+  IRBuilder<> Builder(AI->getParent(), AI->getIterator());
+  ValLoadInst->mutateType(IntegerType::getInt32Ty(AI->getContext()));
+  AI->setOperand(1, Builder.CreateBitCast(ValLoadInst, ValTy,
+                                          ValLoadInst->getName() + ".bitcast"));
+
+  // Insert inttoptr to replace atomicrmw pointer operand
+  PtrLoadInst->mutateType(IntegerType::getInt32Ty(AI->getContext()));
+  AI->setOperand(0,
+                 Builder.CreateIntToPtr(PtrLoadInst, PtrTy,
+                                        PtrLoadInst->getName() + ".inttoptr"));
+  return true;
+}
+
 bool Vectorizer::run() {
   bool Changed = false;
+
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+    if (auto *AI = dyn_cast<AtomicRMWInst>(&*I))
+      Changed |= prepareAtomicInstOps(AI);
+
   // Break up the BB if there are any instrs which aren't guaranteed to transfer
   // execution to their successor.
   //
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 495bfec5454ee8..4c6d0559593820 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -337,245 +337,222 @@ entry:
 define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) {
 ; GFX7LESS-LABEL: add_i32_uniform:
 ; GFX7LESS:       ; %bb.0: ; %entry
-; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
-; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
-; GFX7LESS-NEXT:    s_load_dword s2, s[2:3], 0xd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s9, v0
+; GFX7LESS-NEXT:    s_load_dword s4, s[2:3], 0xd
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX7LESS-NEXT:  ; %bb.1:
+; GFX7LESS-NEXT:    s_load_dword s8, s[2:3], 0xb
+; GFX7LESS-NEXT:    s_mov_b32 s9, 0
 ; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
-; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
 ; GFX7LESS-NEXT:    s_mov_b32 s10, -1
-; GFX7LESS-NEXT:    s_mov_b32 s8, s6
-; GFX7LESS-NEXT:    s_mov_b32 s9, s7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7LESS-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    buffer_wbinvl1
 ; GFX7LESS-NEXT:  .LBB1_2:
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s6, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s5, v0
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: add_i32_uniform:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX8-NEXT:    s_load_dword s8, s[2:3], 0x34
-; GFX8-NEXT:    s_mov_b64 s[0:1], exec
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
-; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX8-NEXT:    s_load_dword s8, s[2:3], 0x2c
+; GFX8-NEXT:    s_mov_b32 s9, 0
+; GFX8-NEXT:    s_mov_b32 s11, 0xf000
+; GFX8-NEXT:    s_mov_b32 s10, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mul_i32 s0, s8, s0
-; GFX8-NEXT:    s_mov_b32 s15, 0xf000
-; GFX8-NEXT:    s_mov_b32 s14, -1
-; GFX8-NEXT:    s_mov_b32 s12, s6
-; GFX8-NEXT:    s_mov_b32 s13, s7
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1_vol
 ; GFX8-NEXT:  .LBB1_2:
-; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
-; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: add_i32_uniform:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_load_dword s8, s[2:3], 0x34
-; GFX9-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    s_load_dword s8, s[2:3], 0x2c
+; GFX9-NEXT:    s_mov_b32 s9, 0
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s0, s8, s0
-; GFX9-NEXT:    s_mov_b32 s15, 0xf000
-; GFX9-NEXT:    s_mov_b32 s14, -1
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB1_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: add_i32_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT:    s_load_dword s10, s[2:3], 0x34
-; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
-; GFX1064-NEXT:    s_mov_b32 s15, 0x31016000
+; GFX1064-NEXT:    s_load_dword s8, s[2:3], 0x2c
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mul_i32 s2, s10, s2
-; GFX1064-NEXT:    s_mov_b32 s14, -1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b32 s12, s6
-; GFX1064-NEXT:    s_mov_b32 s13, s7
-; GFX1064-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1064-NEXT:    s_mov_b32 s9, 0
+; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s10, -1
+; GFX1064-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB1_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT:    s_mov_b32 null, 0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s10, v0, s[0:1]
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], s4, v0, s[2:3]
+; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: add_i32_uniform:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    s_load_dword s0, s[2:3], 0x34
-; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s8
-; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
-; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    s_mov_b32 s8, s6
-; GFX1032-NEXT:    s_mov_b32 s9, s7
-; GFX1032-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    s_mov_b32 s5, 0
+; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB1_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032-NEXT:    s_mov_b32 null, 0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
 ; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX1032-NEXT:    s_endpgm
 ;
 ; GFX1164-LABEL: add_i32_uniform:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1164-NEXT:    s_load_b32 s2, s[2:3], 0x34
-; GFX1164-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT:    s_load_b32 s8, s[2:3], 0x2c
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_mul_i32 s3, s2, s3
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1164-NEXT:    s_mov_b32 s9, 0
+; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s10, -1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1164-NEXT:    s_mov_b32 s8, s6
-; GFX1164-NEXT:    s_mov_b32 s9, s7
 ; GFX1164-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    buffer_gl1_inv
 ; GFX1164-NEXT:    buffer_gl0_inv
 ; GFX1164-NEXT:  .LBB1_2:
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT:    s_mov_b32 s6, -1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
-; GFX1164-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
+; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s4, v0, s[2:3]
+; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: add_i32_uniform:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX1132-NEXT:    s_load_b32 s0, s[2:3], 0x34
-; GFX1132-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s8
-; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
-; GFX1132-NEXT:    s_mov_b32 s10, -1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:    s_mov_b32 s8, s6
-; GFX1132-NEXT:    s_mov_b32 s9, s7
-; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1132-NEXT:    s_mov_b32 s5, 0
+; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT:    s_mov_b32 s6, -1
+; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    buffer_gl1_inv
 ; GFX1132-NEXT:    buffer_gl0_inv
 ; GFX1132-NEXT:  .LBB1_2:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
 ; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1132-NEXT:    s_mov_b32 s6, -1
+; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
 ; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
@@ -583,72 +560,62 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1264-LABEL: add_i32_uniform:
 ; GFX1264:       ; %bb.0: ; %entry
-; GFX1264-NEXT:    s_clause 0x1
-; GFX1264-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1264-NEXT:    s_load_b32 s2, s[2:3], 0x34
-; GFX1264-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1264-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1264-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1264-NEXT:    ; implicit-def: $vgpr1
-; GFX1264-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1264-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1264-NEXT:  ; %bb.1:
-; GFX1264-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT:    s_load_b32 s8, s[2:3], 0x2c
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_mul_i32 s3, s2, s3
+; GFX1264-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1264-NEXT:    s_mov_b32 s9, 0
+; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1264-NEXT:    s_mov_b32 s10, -1
-; GFX1264-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1264-NEXT:    s_mov_b32 s8, s6
-; GFX1264-NEXT:    s_mov_b32 s9, s7
 ; GFX1264-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB1_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[0:1]
-; GFX1264-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX1264-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1264-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1264-NEXT:    s_mov_b32 s6, -1
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT:    v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
-; GFX1264-NEXT:    buffer_store_b32 v0, off, s[4:7], null
+; GFX1264-NEXT:    v_mad_co_u64_u32 v[0:1], null, s4, v0, s[2:3]
+; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT:    s_mov_b32 s2, -1
+; GFX1264-NEXT:    buffer_store_b32 v0, off, s[0:3], null
 ; GFX1264-NEXT:    s_endpgm
 ;
 ; GFX1232-LABEL: add_i32_uniform:
 ; GFX1232:       ; %bb.0: ; %entry
-; GFX1232-NEXT:    s_clause 0x1
-; GFX1232-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX1232-NEXT:    s_load_b32 s0, s[2:3], 0x34
-; GFX1232-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1232-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1232-NEXT:    ; implicit-def: $vgpr1
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1232-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX1232-NEXT:  ; %bb.1:
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_bcnt1_i32_b32 s2, s8
-; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_mul_i32 s2, s0, s2
-; GFX1232-NEXT:    s_mov_b32 s10, -1
-; GFX1232-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1232-NEXT:    s_mov_b32 s8, s6
-; GFX1232-NEXT:    s_mov_b32 s9, s7
-; GFX1232-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1232-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1232-NEXT:    s_mov_b32 s5, 0
+; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT:    s_mov_b32 s6, -1
+; GFX1232-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB1_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1232-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v1
-; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX1232-NEXT:    s_mov_b32 s6, -1
+; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-NEXT:    v_mad_co_u64_u32 v[0:1], null, s0, v0, s[2:3]
 ; GFX1232-NEXT:    buffer_store_b32 v0, off, s[4:7], null
@@ -3918,170 +3885,153 @@ entry:
 define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %subitive) {
 ; GFX7LESS-LABEL: sub_i32_uniform:
 ; GFX7LESS:       ; %bb.0: ; %entry
-; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
-; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x9
-; GFX7LESS-NEXT:    s_load_dword s2, s[2:3], 0xd
-; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
-; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s9, v0
+; GFX7LESS-NEXT:    s_load_dword s4, s[2:3], 0xd
+; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
 ; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
 ; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX7LESS-NEXT:  ; %bb.1:
+; GFX7LESS-NEXT:    s_load_dword s8, s[2:3], 0xb
+; GFX7LESS-NEXT:    s_mov_b32 s9, 0
 ; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
-; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
-; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_mul_i32 s3, s2, s3
 ; GFX7LESS-NEXT:    s_mov_b32 s10, -1
-; GFX7LESS-NEXT:    s_mov_b32 s8, s6
-; GFX7LESS-NEXT:    s_mov_b32 s9, s7
-; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7LESS-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7LESS-NEXT:    buffer_wbinvl1
 ; GFX7LESS-NEXT:  .LBB7_2:
 ; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT:    s_mov_b32 s2, -1
+; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT:    s_mov_b32 s6, -1
-; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
-; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
+; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX7LESS-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: sub_i32_uniform:
 ; GFX8:       ; %bb.0: ; %entry
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX8-NEXT:    s_load_dword s8, s[2:3], 0x34
-; GFX8-NEXT:    s_mov_b64 s[0:1], exec
-; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX8-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
-; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX8-NEXT:  ; %bb.1:
-; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX8-NEXT:    s_load_dword s8, s[2:3], 0x2c
+; GFX8-NEXT:    s_mov_b32 s9, 0
+; GFX8-NEXT:    s_mov_b32 s11, 0xf000
+; GFX8-NEXT:    s_mov_b32 s10, -1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_mul_i32 s0, s8, s0
-; GFX8-NEXT:    s_mov_b32 s15, 0xf000
-; GFX8-NEXT:    s_mov_b32 s14, -1
-; GFX8-NEXT:    s_mov_b32 s12, s6
-; GFX8-NEXT:    s_mov_b32 s13, s7
-; GFX8-NEXT:    v_mov_b32_e32 v1, s0
-; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX8-NEXT:    v_mov_b32_e32 v1, s4
+; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    buffer_wbinvl1_vol
 ; GFX8-NEXT:  .LBB7_2:
-; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX8-NEXT:    s_mov_b32 s7, 0xf000
-; GFX8-NEXT:    s_mov_b32 s6, -1
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX8-NEXT:    s_mov_b32 s3, 0xf000
+; GFX8-NEXT:    s_mov_b32 s2, -1
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX8-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: sub_i32_uniform:
 ; GFX9:       ; %bb.0: ; %entry
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT:    s_load_dword s8, s[2:3], 0x34
-; GFX9-NEXT:    s_mov_b64 s[0:1], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    ; implicit-def: $vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT:    s_load_dword s8, s[2:3], 0x2c
+; GFX9-NEXT:    s_mov_b32 s9, 0
+; GFX9-NEXT:    s_mov_b32 s11, 0xf000
+; GFX9-NEXT:    s_mov_b32 s10, -1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_mul_i32 s0, s8, s0
-; GFX9-NEXT:    s_mov_b32 s15, 0xf000
-; GFX9-NEXT:    s_mov_b32 s14, -1
-; GFX9-NEXT:    s_mov_b32 s12, s6
-; GFX9-NEXT:    s_mov_b32 s13, s7
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    buffer_wbinvl1_vol
 ; GFX9-NEXT:  .LBB7_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX9-NEXT:    s_mov_b32 s7, 0xf000
-; GFX9-NEXT:    s_mov_b32 s6, -1
-; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX1064-LABEL: sub_i32_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
-; GFX1064-NEXT:    s_clause 0x1
-; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT:    s_load_dword s10, s[2:3], 0x34
-; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1064-NEXT:    s_load_dword s4, s[2:3], 0x34
+; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr1
-; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1064-NEXT:  ; %bb.1:
-; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
-; GFX1064-NEXT:    s_mov_b32 s15, 0x31016000
+; GFX1064-NEXT:    s_load_dword s8, s[2:3], 0x2c
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    s_mul_i32 s2, s10, s2
-; GFX1064-NEXT:    s_mov_b32 s14, -1
-; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1064-NEXT:    s_mov_b32 s12, s6
-; GFX1064-NEXT:    s_mov_b32 s13, s7
-; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX1064-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1064-NEXT:    s_mov_b32 s9, 0
+; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT:    s_mov_b32 s10, -1
+; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1064-NEXT:    buffer_gl1_inv
 ; GFX1064-NEXT:    buffer_gl0_inv
 ; GFX1064-NEXT:  .LBB7_2:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x24
 ; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT:    v_mul_lo_u32 v0, s10, v0
-; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT:    s_mov_b32 s6, -1
-; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX1064-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
+; GFX1064-NEXT:    s_mov_b32 s2, -1
+; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX1064-NEXT:    s_endpgm
 ;
 ; GFX1032-LABEL: sub_i32_uniform:
 ; GFX1032:       ; %bb.0: ; %entry
-; GFX1032-NEXT:    s_clause 0x1
-; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX1032-NEXT:    s_load_dword s0, s[2:3], 0x34
-; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr1
-; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1032-NEXT:  ; %bb.1:
-; GFX1032-NEXT:    s_bcnt1_i32_b32 s2, s8
-; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT:    s_load_dword s4, s[2:3], 0x2c
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT:    s_mul_i32 s2, s0, s2
-; GFX1032-NEXT:    s_mov_b32 s10, -1
-; GFX1032-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1032-NEXT:    s_mov_b32 s8, s6
-; GFX1032-NEXT:    s_mov_b32 s9, s7
-; GFX1032-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
+; GFX1032-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1032-NEXT:    s_mov_b32 s5, 0
+; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT:    s_mov_b32 s6, -1
+; GFX1032-NEXT:    buffer_atomic_sub v1, off, s[4:7], 0 glc
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1032-NEXT:    buffer_gl1_inv
 ; GFX1032-NEXT:    buffer_gl0_inv
 ; GFX1032-NEXT:  .LBB7_2:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x24
 ; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
@@ -4093,69 +4043,61 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1164-LABEL: sub_i32_uniform:
 ; GFX1164:       ; %bb.0: ; %entry
-; GFX1164-NEXT:    s_clause 0x1
-; GFX1164-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1164-NEXT:    s_load_b32 s2, s[2:3], 0x34
-; GFX1164-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1164-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr1
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1164-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1164-NEXT:  ; %bb.1:
-; GFX1164-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT:    s_load_b32 s8, s[2:3], 0x2c
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    s_mul_i32 s3, s2, s3
+; GFX1164-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1164-NEXT:    s_mov_b32 s9, 0
+; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1164-NEXT:    s_mov_b32 s10, -1
-; GFX1164-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1164-NEXT:    s_mov_b32 s8, s6
-; GFX1164-NEXT:    s_mov_b32 s9, s7
 ; GFX1164-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    buffer_gl1_inv
 ; GFX1164-NEXT:    buffer_gl0_inv
 ; GFX1164-NEXT:  .LBB7_2:
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT:    s_mov_b32 s6, -1
+; GFX1164-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX1164-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
+; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
+; GFX1164-NEXT:    s_mov_b32 s2, -1
+; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
 ; GFX1164-NEXT:    s_endpgm
 ;
 ; GFX1132-LABEL: sub_i32_uniform:
 ; GFX1132:       ; %bb.0: ; %entry
-; GFX1132-NEXT:    s_clause 0x1
-; GFX1132-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX1132-NEXT:    s_load_b32 s0, s[2:3], 0x34
-; GFX1132-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1132-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1132-NEXT:  ; %bb.1:
-; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s8
-; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
-; GFX1132-NEXT:    s_mov_b32 s10, -1
-; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1132-NEXT:    s_mov_b32 s8, s6
-; GFX1132-NEXT:    s_mov_b32 s9, s7
-; GFX1132-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
+; GFX1132-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1132-NEXT:    s_mov_b32 s5, 0
+; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT:    s_mov_b32 s6, -1
+; GFX1132-NEXT:    buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    buffer_gl1_inv
 ; GFX1132-NEXT:    buffer_gl0_inv
 ; GFX1132-NEXT:  .LBB7_2:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
 ; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX1132-NEXT:    v_readfirstlane_b32 s0, v1
@@ -4168,69 +4110,59 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1264-LABEL: sub_i32_uniform:
 ; GFX1264:       ; %bb.0: ; %entry
-; GFX1264-NEXT:    s_clause 0x1
-; GFX1264-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1264-NEXT:    s_load_b32 s2, s[2:3], 0x34
-; GFX1264-NEXT:    s_mov_b64 s[8:9], exec
+; GFX1264-NEXT:    s_load_b32 s4, s[2:3], 0x34
+; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1264-NEXT:    s_mov_b64 s[0:1], exec
-; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1264-NEXT:    ; implicit-def: $vgpr1
-; GFX1264-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
 ; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1264-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1264-NEXT:  ; %bb.1:
-; GFX1264-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT:    s_load_b32 s8, s[2:3], 0x2c
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    s_mul_i32 s3, s2, s3
+; GFX1264-NEXT:    v_mov_b32_e32 v1, s4
+; GFX1264-NEXT:    s_mov_b32 s9, 0
+; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
 ; GFX1264-NEXT:    s_mov_b32 s10, -1
-; GFX1264-NEXT:    v_mov_b32_e32 v1, s3
-; GFX1264-NEXT:    s_mov_b32 s8, s6
-; GFX1264-NEXT:    s_mov_b32 s9, s7
 ; GFX1264-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1264-NEXT:    s_wait_loadcnt 0x0
 ; GFX1264-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1264-NEXT:  .LBB7_2:
 ; GFX1264-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT:    s_load_b64 s[0:1], s[2:3], 0x24
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
-; GFX1264-NEXT:    v_mul_lo_u32 v0, s2, v0
-; GFX1264-NEXT:    v_readfirstlane_b32 s0, v1
-; GFX1264-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX1264-NEXT:    s_mov_b32 s6, -1
+; GFX1264-NEXT:    v_mul_lo_u32 v0, s4, v0
+; GFX1264-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX1264-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
-; GFX1264-NEXT:    buffer_store_b32 v0, off, s[4:7], null
+; GFX1264-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
+; GFX1264-NEXT:    s_mov_b32 s2, -1
+; GFX1264-NEXT:    buffer_store_b32 v0, off, s[0:3], null
 ; GFX1264-NEXT:    s_endpgm
 ;
 ; GFX1232-LABEL: sub_i32_uniform:
 ; GFX1232:       ; %bb.0: ; %entry
-; GFX1232-NEXT:    s_clause 0x1
-; GFX1232-NEXT:    s_load_b128 s[4:7], s[2:3], 0x24
 ; GFX1232-NEXT:    s_load_b32 s0, s[2:3], 0x34
-; GFX1232-NEXT:    s_mov_b32 s8, exec_lo
+; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
 ; GFX1232-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
 ; GFX1232-NEXT:    ; implicit-def: $vgpr1
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX1232-NEXT:    s_cbranch_execz .LBB7_2
 ; GFX1232-NEXT:  ; %bb.1:
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_bcnt1_i32_b32 s2, s8
-; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT:    s_load_b32 s4, s[2:3], 0x2c
 ; GFX1232-NEXT:    s_wait_kmcnt 0x0
-; GFX1232-NEXT:    s_mul_i32 s2, s0, s2
-; GFX1232-NEXT:    s_mov_b32 s10, -1
-; GFX1232-NEXT:    v_mov_b32_e32 v1, s2
-; GFX1232-NEXT:    s_mov_b32 s8, s6
-; GFX1232-NEXT:    s_mov_b32 s9, s7
-; GFX1232-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1232-NEXT:    v_mov_b32_e32 v1, s0
+; GFX1232-NEXT:    s_mov_b32 s5, 0
+; GFX1232-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT:    s_mov_b32 s6, -1
+; GFX1232-NEXT:    buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB7_2:
 ; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX1232-NEXT:    s_load_b64 s[4:5], s[2:3], 0x24
 ; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_mul_lo_u32 v0, s0, v0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s0, v1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/fp-atomics.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/fp-atomics.ll
new file mode 100644
index 00000000000000..90480a83db48e0
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/fp-atomics.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=load-store-vectorizer -S < %s | FileCheck %s
+
+; Test atomicrmw fadd with <2 x half> data type.
+define amdgpu_kernel void @func1(ptr addrspace(3) %ptr, <2 x half> %data) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @func1(
+; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]], <2 x half> [[DATA:%.*]]) {
+; CHECK-NEXT:    [[FUNC1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT:    [[PTR_KERNARG_OFFSET1:%.*]] = bitcast ptr addrspace(4) [[FUNC1_KERNARG_SEGMENT]] to ptr addrspace(4)
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(4) [[PTR_KERNARG_OFFSET1]], align 16
+; CHECK-NEXT:    [[PTR_LOAD1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[DATA_LOAD2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[FOO1_BITCAST:%.*]] = bitcast i32 [[DATA_LOAD2]] to <2 x half>
+; CHECK-NEXT:    [[FOO1_INTTOPTR:%.*]] = inttoptr i32 [[PTR_LOAD1]] to ptr addrspace(3)
+; CHECK-NEXT:    [[I1:%.*]] = atomicrmw fadd ptr addrspace(3) [[FOO1_INTTOPTR]], <2 x half> [[FOO1_BITCAST]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  %func1.kernarg.segment = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+  %ptr.kernarg.offset1 = bitcast ptr addrspace(4) %func1.kernarg.segment to ptr addrspace(4)
+  %ptr.load = load ptr addrspace(3), ptr addrspace(4) %ptr.kernarg.offset1, align 16
+  %data.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %func1.kernarg.segment, i64 4
+  %data.load = load <2 x half>, ptr addrspace(4) %data.kernarg.offset, align 4
+  %i1 = atomicrmw fadd ptr addrspace(3) %ptr.load, <2 x half> %data.load syncscope("agent") seq_cst, align 4
+  ret void
+}
+
+; Test atomicrmw fadd with <2 x half> data type and an i32 pointer.
+define amdgpu_kernel void @func2(i32 %ptr.as.int, <2 x half> %data) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @func2(
+; CHECK-SAME: i32 [[PTR_AS_INT:%.*]], <2 x half> [[DATA:%.*]]) {
+; CHECK-NEXT:    [[FUNC2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT:    [[PTR_AS_INT_KERNARG_OFFSET1:%.*]] = bitcast ptr addrspace(4) [[FUNC2_KERNARG_SEGMENT]] to ptr addrspace(4)
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(4) [[PTR_AS_INT_KERNARG_OFFSET1]], align 16
+; CHECK-NEXT:    [[PTR_AS_INT_LOAD:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[DATA_LOAD2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT:    [[PTR:%.*]] = inttoptr i32 [[PTR_AS_INT_LOAD]] to ptr addrspace(3)
+; CHECK-NEXT:    [[DATA_LOAD_BITCAST:%.*]] = bitcast i32 [[DATA_LOAD2]] to <2 x half>
+; CHECK-NEXT:    [[PTR_AS_INT_LOAD_INTTOPTR:%.*]] = inttoptr i32 [[PTR_AS_INT_LOAD]] to ptr addrspace(3)
+; CHECK-NEXT:    [[I1:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR_AS_INT_LOAD_INTTOPTR]], <2 x half> [[DATA_LOAD_BITCAST]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT:    ret void
+;
+  %func2.kernarg.segment = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+  %ptr.as.int.kernarg.offset1 = bitcast ptr addrspace(4) %func2.kernarg.segment to ptr addrspace(4)
+  %ptr.as.int.load = load i32, ptr addrspace(4) %ptr.as.int.kernarg.offset1, align 16
+  %data.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %func2.kernarg.segment, i64 4
+  %data.load = load <2 x half>, ptr addrspace(4) %data.kernarg.offset, align 4
+  %ptr = inttoptr i32 %ptr.as.int.load to ptr addrspace(3)
+  %i1 = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %data.load syncscope("agent") seq_cst, align 4
+  ret void
+}