[llvm] [LSV] Prepare atomicrmw operands for vectorization (PR #114702)
Anshil Gandhi via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 3 02:47:59 PST 2024
https://github.com/gandhi56 created https://github.com/llvm/llvm-project/pull/114702
Insert necessary casts and inttoptr instructions
to support vectorization of atomicrmw operands.
Change-Id: I8c3b7840ca5029b3c840359c44654ebec133fd2c
From fd0fb2c10a504f4e1b41f5ecf1c7ba962de49f64 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <Anshil.Gandhi at amd.com>
Date: Mon, 28 Oct 2024 02:48:30 +0000
Subject: [PATCH] [LSV] Prepare atomicrmw operands for vectorization
Insert necessary casts and inttoptr instructions
to support vectorization of atomicrmw operands.
Change-Id: I8c3b7840ca5029b3c840359c44654ebec133fd2c
---
.../Vectorize/LoadStoreVectorizer.cpp | 91 +++
.../atomic_optimizations_global_pointer.ll | 544 ++++++++----------
.../LoadStoreVectorizer/AMDGPU/fp-atomics.ll | 50 ++
3 files changed, 379 insertions(+), 306 deletions(-)
create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/fp-atomics.ll
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 02ec1d5c259cd6..8767141c5d764c 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -86,6 +86,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -338,6 +339,10 @@ class Vectorizer {
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
/// in the chain is the leader, and an instr touches distance 0 from itself.
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+ /// Attempts to prepare atomic read-modify-write instructions for
+ /// vectorization.
+ bool prepareAtomicInstOps(AtomicRMWInst *);
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -419,8 +424,94 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
return Changed ? PA : PreservedAnalyses::all();
}
+/**
+ * @brief Prepare operands of atomicrmw instructions for vectorization.
+ *
+ * Ensures the given AtomicRMWInst's pointer and value operands meet type
+ * requirements and are load instructions. Inserts necessary bitcast and
+ * inttoptr instructions.
+ *
+ * @param AI Pointer to the AtomicRMWInst in question.
+ * @return true if the operands were successfully prepared, false otherwise.
+ */
+bool Vectorizer::prepareAtomicInstOps(AtomicRMWInst *AI) {
+ if (AI->isVolatile() || AI->hasMetadata("amdgpu.no.fine.grained.memory"))
+ return false;
+
+ auto *Ptr = AI->getPointerOperand();
+ auto *PtrTy = Ptr->getType();
+ auto *Val = AI->getValOperand();
+ auto *ValTy = Val->getType();
+
+ if (!PtrTy->isPointerTy())
+ return false;
+
+ // Only cast if the value operand type is
+ // <2 x half>, <2 x i16>, <4 x i8>, f32, or i32
+ bool ValTyIsOkay = false;
+ if (auto *VTy = dyn_cast<FixedVectorType>(ValTy)) {
+ if (VTy->getNumElements() == 2) {
+ if (VTy->getElementType()->isHalfTy())
+ ValTyIsOkay = true;
+ if (VTy->getElementType()->isIntegerTy(16))
+ ValTyIsOkay = true;
+ } else if (VTy->getNumElements() == 4 &&
+ VTy->getElementType()->isIntegerTy(8)) {
+ ValTyIsOkay = true;
+ }
+ } else {
+ if (ValTy->isFloatTy())
+ ValTyIsOkay = true;
+ if (ValTy->isIntegerTy(32))
+ ValTyIsOkay = true;
+ }
+ if (!ValTyIsOkay)
+ return false;
+
+ // Walk up the chain of instructions to find the load instruction
+ auto GetLoadInst = [](Value *Ptr) -> LoadInst * {
+ while (Ptr) {
+ if (!isa<Instruction>(Ptr))
+ return nullptr;
+ if (auto *LI = dyn_cast<LoadInst>(Ptr))
+ return LI;
+ if (isa<GetElementPtrInst>(Ptr))
+ return nullptr;
+ if (auto *PtrInst = dyn_cast<Instruction>(Ptr))
+ Ptr = PtrInst->getOperand(0);
+ else
+ return nullptr;
+ }
+ return nullptr;
+ };
+
+ // Pointer and value operands must be load instructions to be vectorized
+ auto *ValLoadInst = GetLoadInst(Val);
+ auto *PtrLoadInst = GetLoadInst(Ptr);
+ if (!ValLoadInst || !PtrLoadInst)
+ return false;
+
+ // Insert bitcast to replace atomicrmw value operand
+ IRBuilder<> Builder(AI->getParent(), AI->getIterator());
+ ValLoadInst->mutateType(IntegerType::getInt32Ty(AI->getContext()));
+ AI->setOperand(1, Builder.CreateBitCast(ValLoadInst, ValTy,
+ ValLoadInst->getName() + ".bitcast"));
+
+ // Insert inttoptr to replace atomicrmw pointer operand
+ PtrLoadInst->mutateType(IntegerType::getInt32Ty(AI->getContext()));
+ AI->setOperand(0,
+ Builder.CreateIntToPtr(PtrLoadInst, PtrTy,
+ PtrLoadInst->getName() + ".inttoptr"));
+ return true;
+}
+
bool Vectorizer::run() {
bool Changed = false;
+
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+ if (auto *AI = dyn_cast<AtomicRMWInst>(&*I))
+ Changed |= prepareAtomicInstOps(AI);
+
// Break up the BB if there are any instrs which aren't guaranteed to transfer
// execution to their successor.
//
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 495bfec5454ee8..4c6d0559593820 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -337,245 +337,222 @@ entry:
define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) {
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec
-; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
-; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0
+; GFX7LESS-NEXT: s_load_dword s4, s[2:3], 0xd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2
; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dword s8, s[2:3], 0xb
+; GFX7LESS-NEXT: s_mov_b32 s9, 0
; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3
; GFX7LESS-NEXT: s_mov_b32 s10, -1
-; GFX7LESS-NEXT: s_mov_b32 s8, s6
-; GFX7LESS-NEXT: s_mov_b32 s9, s7
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4
; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: .LBB1_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1
-; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34
-; GFX8-NEXT: s_mov_b64 s[0:1], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX8-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB1_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX8-NEXT: s_load_dword s8, s[2:3], 0x2c
+; GFX8-NEXT: s_mov_b32 s9, 0
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s0, s8, s0
-; GFX8-NEXT: s_mov_b32 s15, 0xf000
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s12, s6
-; GFX8-NEXT: s_mov_b32 s13, s7
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB1_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34
-; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT: s_load_dword s8, s[2:3], 0x2c
+; GFX9-NEXT: s_mov_b32 s9, 0
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s0, s8, s0
-; GFX9-NEXT: s_mov_b32 s15, 0xf000
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s12, s6
-; GFX9-NEXT: s_mov_b32 s13, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB1_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34
-; GFX1064-NEXT: s_mov_b64 s[8:9], exec
+; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB1_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
-; GFX1064-NEXT: s_mov_b32 s15, 0x31016000
+; GFX1064-NEXT: s_load_dword s8, s[2:3], 0x2c
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s2, s10, s2
-; GFX1064-NEXT: s_mov_b32 s14, -1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
-; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: s_mov_b32 s9, 0
+; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s6, -1
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v0, s[0:1]
-; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34
-; GFX1032-NEXT: s_mov_b32 s8, exec_lo
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB1_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8
-; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s2, s0, s2
-; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s8, s6
-; GFX1032-NEXT: s_mov_b32 s9, s7
-; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: s_mov_b32 s5, 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34
-; GFX1164-NEXT: s_mov_b64 s[8:9], exec
+; GFX1164-NEXT: s_load_b32 s4, s[2:3], 0x34
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT: s_load_b32 s8, s[2:3], 0x2c
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s3, s2, s3
+; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: s_mov_b32 s9, 0
+; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s3
-; GFX1164-NEXT: s_mov_b32 s8, s6
-; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB1_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
-; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[2:3]
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34
-; GFX1132-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8
-; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s2, s0, s2
-; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-NEXT: s_mov_b32 s8, s6
-; GFX1132-NEXT: s_mov_b32 s9, s7
-; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
+; GFX1132-NEXT: s_mov_b32 s5, 0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB1_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
@@ -583,72 +560,62 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1264-LABEL: add_i32_uniform:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_clause 0x1
-; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34
-; GFX1264-NEXT: s_mov_b64 s[8:9], exec
+; GFX1264-NEXT: s_load_b32 s4, s[2:3], 0x34
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1264-NEXT: s_mov_b64 s[0:1], exec
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1264-NEXT: s_cbranch_execz .LBB1_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_load_b32 s8, s[2:3], 0x2c
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mul_i32 s3, s2, s3
+; GFX1264-NEXT: v_mov_b32_e32 v1, s4
+; GFX1264-NEXT: s_mov_b32 s9, 0
+; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v1, s3
-; GFX1264-NEXT: s_mov_b32 s8, s6
-; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB1_2:
; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v0, s[2:3]
+; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s2, -1
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: add_i32_uniform:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_clause 0x1
-; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34
-; GFX1232-NEXT: s_mov_b32 s8, exec_lo
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232-NEXT: s_mov_b32 s1, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB1_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_wait_alu 0xfffe
-; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8
-; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mul_i32 s2, s0, s2
-; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_mov_b32_e32 v1, s2
-; GFX1232-NEXT: s_mov_b32 s8, s6
-; GFX1232-NEXT: s_mov_b32 s9, s7
-; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1232-NEXT: v_mov_b32_e32 v1, s0
+; GFX1232-NEXT: s_mov_b32 s5, 0
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
+; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB1_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1232-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: s_mov_b32 s6, -1
+; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[2:3]
; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
@@ -3918,170 +3885,153 @@ entry:
define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %subitive) {
; GFX7LESS-LABEL: sub_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec
-; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
-; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0
+; GFX7LESS-NEXT: s_load_dword s4, s[2:3], 0xd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2
; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dword s8, s[2:3], 0xb
+; GFX7LESS-NEXT: s_mov_b32 s9, 0
; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3
; GFX7LESS-NEXT: s_mov_b32 s10, -1
-; GFX7LESS-NEXT: s_mov_b32 s8, s6
-; GFX7LESS-NEXT: s_mov_b32 s9, s7
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4
; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: .LBB7_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1
-; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s5, v0
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: sub_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34
-; GFX8-NEXT: s_mov_b64 s[0:1], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX8-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB7_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX8-NEXT: s_load_dword s8, s[2:3], 0x2c
+; GFX8-NEXT: s_mov_b32 s9, 0
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s0, s8, s0
-; GFX8-NEXT: s_mov_b32 s15, 0xf000
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s12, s6
-; GFX8-NEXT: s_mov_b32 s13, s7
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB7_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34
-; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB7_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT: s_load_dword s8, s[2:3], 0x2c
+; GFX9-NEXT: s_mov_b32 s9, 0
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s0, s8, s0
-; GFX9-NEXT: s_mov_b32 s15, 0xf000
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s12, s6
-; GFX9-NEXT: s_mov_b32 s13, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB7_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34
-; GFX1064-NEXT: s_mov_b64 s[8:9], exec
+; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB7_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
-; GFX1064-NEXT: s_mov_b32 s15, 0x31016000
+; GFX1064-NEXT: s_load_dword s8, s[2:3], 0x2c
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s2, s10, s2
-; GFX1064-NEXT: s_mov_b32 s14, -1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
-; GFX1064-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc
+; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: s_mov_b32 s9, 0
+; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB7_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mul_lo_u32 v0, s10, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s6, -1
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX1064-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34
-; GFX1032-NEXT: s_mov_b32 s8, exec_lo
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8
-; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s2, s0, s2
-; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s8, s6
-; GFX1032-NEXT: s_mov_b32 s9, s7
-; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: s_mov_b32 s5, 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB7_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
@@ -4093,69 +4043,61 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1164-LABEL: sub_i32_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34
-; GFX1164-NEXT: s_mov_b64 s[8:9], exec
+; GFX1164-NEXT: s_load_b32 s4, s[2:3], 0x34
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB7_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT: s_load_b32 s8, s[2:3], 0x2c
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s3, s2, s3
+; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: s_mov_b32 s9, 0
+; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s3
-; GFX1164-NEXT: s_mov_b32 s8, s6
-; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB7_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i32_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34
-; GFX1132-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB7_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8
-; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s2, s0, s2
-; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-NEXT: s_mov_b32 s8, s6
-; GFX1132-NEXT: s_mov_b32 s9, s7
-; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
+; GFX1132-NEXT: s_mov_b32 s5, 0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB7_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
@@ -4168,69 +4110,59 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1264-LABEL: sub_i32_uniform:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_clause 0x1
-; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34
-; GFX1264-NEXT: s_mov_b64 s[8:9], exec
+; GFX1264-NEXT: s_load_b32 s4, s[2:3], 0x34
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1264-NEXT: s_mov_b64 s[0:1], exec
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1264-NEXT: s_cbranch_execz .LBB7_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1264-NEXT: s_load_b32 s8, s[2:3], 0x2c
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mul_i32 s3, s2, s3
+; GFX1264-NEXT: v_mov_b32_e32 v1, s4
+; GFX1264-NEXT: s_mov_b32 s9, 0
+; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v1, s3
-; GFX1264-NEXT: s_mov_b32 s8, s6
-; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB7_2:
; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
-; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s6, -1
+; GFX1264-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX1264-NEXT: s_mov_b32 s2, -1
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: sub_i32_uniform:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_clause 0x1
-; GFX1232-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX1232-NEXT: s_load_b32 s0, s[2:3], 0x34
-; GFX1232-NEXT: s_mov_b32 s8, exec_lo
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232-NEXT: s_mov_b32 s1, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB7_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_wait_alu 0xfffe
-; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s8
-; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1232-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mul_i32 s2, s0, s2
-; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_mov_b32_e32 v1, s2
-; GFX1232-NEXT: s_mov_b32 s8, s6
-; GFX1232-NEXT: s_mov_b32 s9, s7
-; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1232-NEXT: v_mov_b32_e32 v1, s0
+; GFX1232-NEXT: s_mov_b32 s5, 0
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
+; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB7_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1232-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX1232-NEXT: v_readfirstlane_b32 s0, v1
diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/fp-atomics.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/fp-atomics.ll
new file mode 100644
index 00000000000000..90480a83db48e0
--- /dev/null
+++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/fp-atomics.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=load-store-vectorizer -S < %s | FileCheck %s
+
+; Test atomicrmw fadd with <2 x half> data type.
+define amdgpu_kernel void @func1(ptr addrspace(3) %ptr, <2 x half> %data) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @func1(
+; CHECK-SAME: ptr addrspace(3) [[PTR:%.*]], <2 x half> [[DATA:%.*]]) {
+; CHECK-NEXT: [[FUNC1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[PTR_KERNARG_OFFSET1:%.*]] = bitcast ptr addrspace(4) [[FUNC1_KERNARG_SEGMENT]] to ptr addrspace(4)
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(4) [[PTR_KERNARG_OFFSET1]], align 16
+; CHECK-NEXT: [[PTR_LOAD1:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[DATA_LOAD2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[FOO1_BITCAST:%.*]] = bitcast i32 [[DATA_LOAD2]] to <2 x half>
+; CHECK-NEXT: [[FOO1_INTTOPTR:%.*]] = inttoptr i32 [[PTR_LOAD1]] to ptr addrspace(3)
+; CHECK-NEXT: [[I1:%.*]] = atomicrmw fadd ptr addrspace(3) [[FOO1_INTTOPTR]], <2 x half> [[FOO1_BITCAST]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: ret void
+;
+ %func1.kernarg.segment = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %ptr.kernarg.offset1 = bitcast ptr addrspace(4) %func1.kernarg.segment to ptr addrspace(4)
+ %ptr.load = load ptr addrspace(3), ptr addrspace(4) %ptr.kernarg.offset1, align 16
+ %data.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %func1.kernarg.segment, i64 4
+ %data.load = load <2 x half>, ptr addrspace(4) %data.kernarg.offset, align 4
+ %i1 = atomicrmw fadd ptr addrspace(3) %ptr.load, <2 x half> %data.load syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+; Test atomicrmw fadd with <2 x half> data type and an i32 pointer.
+define amdgpu_kernel void @func2(i32 %ptr.as.int, <2 x half> %data) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @func2(
+; CHECK-SAME: i32 [[PTR_AS_INT:%.*]], <2 x half> [[DATA:%.*]]) {
+; CHECK-NEXT: [[FUNC2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+; CHECK-NEXT: [[PTR_AS_INT_KERNARG_OFFSET1:%.*]] = bitcast ptr addrspace(4) [[FUNC2_KERNARG_SEGMENT]] to ptr addrspace(4)
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(4) [[PTR_AS_INT_KERNARG_OFFSET1]], align 16
+; CHECK-NEXT: [[PTR_AS_INT_LOAD:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[DATA_LOAD2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; CHECK-NEXT: [[PTR:%.*]] = inttoptr i32 [[PTR_AS_INT_LOAD]] to ptr addrspace(3)
+; CHECK-NEXT: [[DATA_LOAD_BITCAST:%.*]] = bitcast i32 [[DATA_LOAD2]] to <2 x half>
+; CHECK-NEXT: [[PTR_AS_INT_LOAD_INTTOPTR:%.*]] = inttoptr i32 [[PTR_AS_INT_LOAD]] to ptr addrspace(3)
+; CHECK-NEXT: [[I1:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR_AS_INT_LOAD_INTTOPTR]], <2 x half> [[DATA_LOAD_BITCAST]] syncscope("agent") seq_cst, align 4
+; CHECK-NEXT: ret void
+;
+ %func2.kernarg.segment = call nonnull align 16 dereferenceable(264) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+ %ptr.as.int.kernarg.offset1 = bitcast ptr addrspace(4) %func2.kernarg.segment to ptr addrspace(4)
+ %ptr.as.int.load = load i32, ptr addrspace(4) %ptr.as.int.kernarg.offset1, align 16
+ %data.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %func2.kernarg.segment, i64 4
+ %data.load = load <2 x half>, ptr addrspace(4) %data.kernarg.offset, align 4
+ %ptr = inttoptr i32 %ptr.as.int.load to ptr addrspace(3)
+ %i1 = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %data.load syncscope("agent") seq_cst, align 4
+ ret void
+}
More information about the llvm-commits
mailing list