[llvm] [LSV] Prepare atomicrmw operands for vectorization (PR #114702)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 3 02:48:31 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Anshil Gandhi (gandhi56)
<details>
<summary>Changes</summary>
Inserts the necessary bitcast and inttoptr instructions
to support vectorization of atomicrmw operands.
---
Patch is 45.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114702.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp (+91)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+238-306)
- (added) llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/fp-atomics.ll (+50)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 02ec1d5c259cd6..8767141c5d764c 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -86,6 +86,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -338,6 +339,10 @@ class Vectorizer {
/// Postcondition: For all i, ret[i][0].second == 0, because the first instr
/// in the chain is the leader, and an instr touches distance 0 from itself.
std::vector<Chain> gatherChains(ArrayRef<Instruction *> Instrs);
+
+ /// Attempts to prepare atomic read-modify-write instructions for
+ /// vectorization.
+ bool prepareAtomicInstOps(AtomicRMWInst *);
};
class LoadStoreVectorizerLegacyPass : public FunctionPass {
@@ -419,8 +424,94 @@ PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
return Changed ? PA : PreservedAnalyses::all();
}
+/**
+ * @brief Prepare operands of atomicrmw instructions for vectorization.
+ *
+ * Ensures the given AtomicRMWInst's pointer and value operands meet type
+ * requirements and are load instructions. Inserts necessary bitcast and
+ * inttoptr instructions.
+ *
+ * @param AI Pointer to the AtomicRMWInst in question.
+ * @return true if the operands were successfully prepared, false otherwise.
+ */
+bool Vectorizer::prepareAtomicInstOps(AtomicRMWInst *AI) {
+ if (AI->isVolatile() || AI->hasMetadata("amdgpu.no.fine.grained.memory"))
+ return false;
+
+ auto *Ptr = AI->getPointerOperand();
+ auto *PtrTy = Ptr->getType();
+ auto *Val = AI->getValOperand();
+ auto *ValTy = Val->getType();
+
+ if (!PtrTy->isPointerTy())
+ return false;
+
+ // Only cast if the value operand type is
+ // <2 x half>, <2 x i16>, <4 x i8>, f32, or i32
+ bool ValTyIsOkay = false;
+ if (auto *VTy = dyn_cast<FixedVectorType>(ValTy)) {
+ if (VTy->getNumElements() == 2) {
+ if (VTy->getElementType()->isHalfTy())
+ ValTyIsOkay = true;
+ if (VTy->getElementType()->isIntegerTy(16))
+ ValTyIsOkay = true;
+ } else if (VTy->getNumElements() == 4 &&
+ VTy->getElementType()->isIntegerTy(8)) {
+ ValTyIsOkay = true;
+ }
+ } else {
+ if (ValTy->isFloatTy())
+ ValTyIsOkay = true;
+ if (ValTy->isIntegerTy(32))
+ ValTyIsOkay = true;
+ }
+ if (!ValTyIsOkay)
+ return false;
+
+ // Walk up the chain of instructions to find the load instruction
+ auto GetLoadInst = [](Value *Ptr) -> LoadInst * {
+ while (Ptr) {
+ if (!isa<Instruction>(Ptr))
+ return nullptr;
+ if (auto *LI = dyn_cast<LoadInst>(Ptr))
+ return LI;
+ if (isa<GetElementPtrInst>(Ptr))
+ return nullptr;
+ if (auto *PtrInst = dyn_cast<Instruction>(Ptr))
+ Ptr = PtrInst->getOperand(0);
+ else
+ return nullptr;
+ }
+ return nullptr;
+ };
+
+ // Pointer and value operands must be load instructions to be vectorized
+ auto *ValLoadInst = GetLoadInst(Val);
+ auto *PtrLoadInst = GetLoadInst(Ptr);
+ if (!ValLoadInst || !PtrLoadInst)
+ return false;
+
+ // Insert bitcast to replace atomicrmw value operand
+ IRBuilder<> Builder(AI->getParent(), AI->getIterator());
+ ValLoadInst->mutateType(IntegerType::getInt32Ty(AI->getContext()));
+ AI->setOperand(1, Builder.CreateBitCast(ValLoadInst, ValTy,
+ ValLoadInst->getName() + ".bitcast"));
+
+ // Insert inttoptr to replace atomicrmw pointer operand
+ PtrLoadInst->mutateType(IntegerType::getInt32Ty(AI->getContext()));
+ AI->setOperand(0,
+ Builder.CreateIntToPtr(PtrLoadInst, PtrTy,
+ PtrLoadInst->getName() + ".inttoptr"));
+ return true;
+}
+
bool Vectorizer::run() {
bool Changed = false;
+
+ for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+ if (auto *AI = dyn_cast<AtomicRMWInst>(&*I))
+ Changed |= prepareAtomicInstOps(AI);
+
// Break up the BB if there are any instrs which aren't guaranteed to transfer
// execution to their successor.
//
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 495bfec5454ee8..4c6d0559593820 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -337,245 +337,222 @@ entry:
define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(1) %inout, i32 %additive) {
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS: ; %bb.0: ; %entry
-; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec
-; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
-; GFX7LESS-NEXT: s_load_dword s2, s[2:3], 0xd
-; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
-; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0
+; GFX7LESS-NEXT: s_load_dword s4, s[2:3], 0xd
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT: ; implicit-def: $vgpr1
; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2
; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dword s8, s[2:3], 0xb
+; GFX7LESS-NEXT: s_mov_b32 s9, 0
; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000
-; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3
; GFX7LESS-NEXT: s_mov_b32 s10, -1
-; GFX7LESS-NEXT: s_mov_b32 s8, s6
-; GFX7LESS-NEXT: s_mov_b32 s9, s7
-; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4
; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
; GFX7LESS-NEXT: buffer_wbinvl1
; GFX7LESS-NEXT: .LBB1_2:
; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000
-; GFX7LESS-NEXT: s_mov_b32 s6, -1
-; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1
-; GFX7LESS-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0
-; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX7LESS-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s5, v0
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX8-NEXT: s_load_dword s8, s[2:3], 0x34
-; GFX8-NEXT: s_mov_b64 s[0:1], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX8-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB1_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX8-NEXT: s_load_dword s8, s[2:3], 0x2c
+; GFX8-NEXT: s_mov_b32 s9, 0
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mul_i32 s0, s8, s0
-; GFX8-NEXT: s_mov_b32 s15, 0xf000
-; GFX8-NEXT: s_mov_b32 s14, -1
-; GFX8-NEXT: s_mov_b32 s12, s6
-; GFX8-NEXT: s_mov_b32 s13, s7
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
-; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
+; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB1_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v1
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX9-NEXT: s_load_dword s8, s[2:3], 0x34
-; GFX9-NEXT: s_mov_b64 s[0:1], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB1_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT: s_load_dword s8, s[2:3], 0x2c
+; GFX9-NEXT: s_mov_b32 s9, 0
+; GFX9-NEXT: s_mov_b32 s11, 0xf000
+; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s0, s8, s0
-; GFX9-NEXT: s_mov_b32 s15, 0xf000
-; GFX9-NEXT: s_mov_b32 s14, -1
-; GFX9-NEXT: s_mov_b32 s12, s6
-; GFX9-NEXT: s_mov_b32 s13, s7
-; GFX9-NEXT: v_mov_b32_e32 v1, s0
-; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB1_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v1
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_mul_lo_u32 v0, s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s4, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_clause 0x1
-; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
-; GFX1064-NEXT: s_load_dword s10, s[2:3], 0x34
-; GFX1064-NEXT: s_mov_b64 s[8:9], exec
+; GFX1064-NEXT: s_load_dword s4, s[2:3], 0x34
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB1_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
-; GFX1064-NEXT: s_mov_b32 s15, 0x31016000
+; GFX1064-NEXT: s_load_dword s8, s[2:3], 0x2c
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s2, s10, s2
-; GFX1064-NEXT: s_mov_b32 s14, -1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b32 s12, s6
-; GFX1064-NEXT: s_mov_b32 s13, s7
-; GFX1064-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc
+; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: s_mov_b32 s9, 0
+; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s6, -1
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v0, s[0:1]
-; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s4, v0, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_clause 0x1
-; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x34
-; GFX1032-NEXT: s_mov_b32 s8, exec_lo
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB1_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s8
-; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1032-NEXT: s_load_dword s4, s[2:3], 0x2c
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s2, s0, s2
-; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s2
-; GFX1032-NEXT: s_mov_b32 s8, s6
-; GFX1032-NEXT: s_mov_b32 s9, s7
-; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: s_mov_b32 s5, 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s0, v0, s[2:3]
; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_clause 0x1
-; GFX1164-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1164-NEXT: s_load_b32 s2, s[2:3], 0x34
-; GFX1164-NEXT: s_mov_b64 s[8:9], exec
+; GFX1164-NEXT: s_load_b32 s4, s[2:3], 0x34
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_mov_b64 s[0:1], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB1_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1164-NEXT: s_load_b32 s8, s[2:3], 0x2c
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s3, s2, s3
+; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: s_mov_b32 s9, 0
+; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s3
-; GFX1164-NEXT: s_mov_b32 s8, s6
-; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB1_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
-; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s4, v0, s[2:3]
+; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_clause 0x1
-; GFX1132-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x34
-; GFX1132-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_mov_b32 s1, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB1_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s8
-; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
+; GFX1132-NEXT: s_load_b32 s4, s[2:3], 0x2c
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s2, s0, s2
-; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_mov_b32_e32 v1, s2
-; GFX1132-NEXT: s_mov_b32 s8, s6
-; GFX1132-NEXT: s_mov_b32 s9, s7
-; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
+; GFX1132-NEXT: s_mov_b32 s5, 0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB1_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[2:3], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
@@ -583,72 +560,62 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
;
; GFX1264-LABEL: add_i32_uniform:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_clause 0x1
-; GFX1264-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
-; GFX1264-NEXT: s_load_b32 s2, s[2:3], 0x34
-; GFX1264-NEXT: s_mov_b64 s[8:9], exec
+; GFX1264-NEXT: s_load_b32 s4, s[2:3], 0x34
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1264-NEXT: s_mov_b64 s[0:1], exec
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
-; GFX1264-NEXT: s_wait_alu 0xfffe
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1264-NEXT: s_cbranch_execz .LBB1_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s3, s[8:9]
-; GFX1264-NEXT: ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/114702
More information about the llvm-commits
mailing list