[llvm] 21a69bd - [NewPM][AMDGPU] Port amdgpu-atomic-optimizer
Pravin Jagtap via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 19 21:31:44 PDT 2023
Author: Pravin Jagtap
Date: 2023-04-20T00:27:47-04:00
New Revision: 21a69bdb66e3d77b066a06dd4d4c3ff8a2dd8daa
URL: https://github.com/llvm/llvm-project/commit/21a69bdb66e3d77b066a06dd4d4c3ff8a2dd8daa
DIFF: https://github.com/llvm/llvm-project/commit/21a69bdb66e3d77b066a06dd4d4c3ff8a2dd8daa.diff
LOG: [NewPM][AMDGPU] Port amdgpu-atomic-optimizer
Reviewed By: arsenm, sameerds, gandhi21299
Differential Revision: https://reviews.llvm.org/D148628
Added:
llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPU.h
llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 10a7f6cc35c11..b1a94b78ec618 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -233,6 +233,14 @@ struct AMDGPUPromoteAllocaToVectorPass
TargetMachine &TM;
};
+struct AMDGPUAtomicOptimizerPass : PassInfoMixin<AMDGPUAtomicOptimizerPass> {
+ AMDGPUAtomicOptimizerPass(TargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ TargetMachine &TM;
+};
+
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
CodeGenOpt::Level OptLevel);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index e4ecfc710d6d8..13eaa7be1abe3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -38,8 +38,23 @@ struct ReplacementInfo {
bool ValDivergent;
};
-class AMDGPUAtomicOptimizer : public FunctionPass,
- public InstVisitor<AMDGPUAtomicOptimizer> {
+class AMDGPUAtomicOptimizer : public FunctionPass {
+public:
+ static char ID;
+
+ AMDGPUAtomicOptimizer() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<UniformityInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ }
+};
+
+class AMDGPUAtomicOptimizerImpl
+ : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
SmallVector<ReplacementInfo, 8> ToReplace;
const UniformityInfo *UA;
@@ -53,21 +68,19 @@ class AMDGPUAtomicOptimizer : public FunctionPass,
Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
Value *const Identity) const;
Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
+
void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
bool ValDivergent) const;
public:
- static char ID;
-
- AMDGPUAtomicOptimizer() : FunctionPass(ID) {}
+ AMDGPUAtomicOptimizerImpl() = delete;
- bool runOnFunction(Function &F) override;
+ AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
+ DominatorTree *DT, const GCNSubtarget *ST,
+ bool IsPixelShader)
+ : UA(UA), DL(DL), DT(DT), ST(ST), IsPixelShader(IsPixelShader) {}
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<UniformityInfoWrapperPass>();
- AU.addRequired<TargetPassConfig>();
- }
+ bool run(Function &F);
void visitAtomicRMWInst(AtomicRMWInst &I);
void visitIntrinsicInst(IntrinsicInst &I);
@@ -84,15 +97,40 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
return false;
}
- UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- DL = &F.getParent()->getDataLayout();
+ const UniformityInfo *UA =
+ &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+ const DataLayout *DL = &F.getParent()->getDataLayout();
+
DominatorTreeWrapperPass *const DTW =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DT = DTW ? &DTW->getDomTree() : nullptr;
+ DominatorTree *DT = DTW ? &DTW->getDomTree() : nullptr;
+
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
- ST = &TM.getSubtarget<GCNSubtarget>(F);
- IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+ const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+
+ bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+
+ return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F);
+}
+
+PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
+ const DataLayout *DL = &F.getParent()->getDataLayout();
+
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+
+ bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+
+ return AMDGPUAtomicOptimizerImpl(UA, DL, DT, ST, IsPixelShader).run(F)
+ ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
+
+bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
visit(F);
@@ -107,7 +145,7 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
return Changed;
}
-void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
+void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// Early exit for unhandled address space atomic instructions.
switch (I.getPointerAddressSpace()) {
default:
@@ -162,7 +200,7 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
ToReplace.push_back(Info);
}
-void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
+void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
AtomicRMWInst::BinOp Op;
switch (I.getIntrinsicID()) {
@@ -283,9 +321,10 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
-Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
- AtomicRMWInst::BinOp Op, Value *V,
- Value *const Identity) const {
+Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
+ AtomicRMWInst::BinOp Op,
+ Value *V,
+ Value *const Identity) const {
Type *const Ty = V->getType();
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
@@ -328,8 +367,9 @@ Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
-Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
- Value *V, Value *const Identity) const {
+Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
+ AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const {
Type *const Ty = V->getType();
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
@@ -385,8 +425,8 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
-Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
- Value *const Identity) const {
+Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
+ Value *const Identity) const {
Type *const Ty = V->getType();
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
@@ -456,10 +496,10 @@ static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}
-void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
- AtomicRMWInst::BinOp Op,
- unsigned ValIdx,
- bool ValDivergent) const {
+void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
+ AtomicRMWInst::BinOp Op,
+ unsigned ValIdx,
+ bool ValDivergent) const {
// Start building just before the instruction.
IRBuilder<> B(&I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 410cd4e803b28..94c3cc30a11e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -650,6 +650,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
return true;
}
+ if (PassName == "amdgpu-atomic-optimizer") {
+ PM.addPass(AMDGPUAtomicOptimizerPass(*this));
+ return true;
+ }
return false;
});
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
new file mode 100644
index 0000000000000..3a63629bc7705
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
@@ -0,0 +1,1440 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-atomic-optimizer %s | FileCheck -check-prefix=IR %s
+
+define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_add_i32_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_add_i32_max_neg_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 -1024
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
+ %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_add_i32_soffset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 9000
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
+ %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_add_i32_huge_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 47224239175595
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
+
+ %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_add_i32_ret_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
+; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
+; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_add_i32_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_add_i32_ret_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
+; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
+; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_add_i32(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_add_i32_ret(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
+; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
+; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_add_i32_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_add_i32_ret_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
+; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
+; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_and_i32_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_and_i32_ret_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_and_i32_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_and_i32_ret_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_and_i32(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_and_i32_ret(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_and_i32_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_and_i32_ret_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_sub_i32_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_sub_i32_ret_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
+; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_sub_i32_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_sub_i32_ret_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
+; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_sub_i32(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_sub_i32_ret(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
+; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_sub_i32_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_sub_i32_ret_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
+; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
+; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
+; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
+; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
+; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_max_i32_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_max_i32_ret_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_max_i32_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_max_i32_ret_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_max_i32(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_max_i32_ret(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_max_i32_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_max_i32_ret_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_umax_i32_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_umax_i32_ret_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_umax_i32_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+; Result-used umax at %out + %index + 4: expect the optimizer to gate the
+; atomicrmw behind an is-first-active-lane test (ballot + mbcnt.lo/hi == 0),
+; then rebuild each lane's old value via readfirstlane of the lane-0 result,
+; a select of the umax identity (0) for lane 0 vs %in, and an explicit
+; unsigned max (icmp ugt + select).
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_umax_i32_ret_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+; Unused-result umax on a uniform pointer: the atomicrmw should be executed
+; by a single lane only (ballot + mbcnt lane-id == 0 guard); no result
+; reconstruction is needed since %val is dead.
+define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_umax_i32(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+; Result-used umax on a uniform pointer: lane 0 performs the atomic, the old
+; value is broadcast with readfirstlane, and per-lane results are rebuilt by
+; max-ing against the umax identity (0) for lane 0 or %in for other lanes.
+define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_umax_i32_ret(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+; Unused-result umax at an indexed address (%out + %index): still expected to
+; be predicated so only the first active lane issues the atomicrmw.
+define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_umax_i32_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+; Result-used umax at an indexed address: single-lane atomic plus
+; readfirstlane broadcast and per-lane unsigned-max reconstruction
+; (identity 0 selected for lane 0).
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_umax_i32_ret_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+; Unused-result signed min at a constant offset: atomicrmw is expected to be
+; guarded so only the first active lane executes it.
+define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_min_i32_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+; Result-used signed min at a constant offset: lane 0 does the atomic, and
+; per-lane results are rebuilt via readfirstlane plus a signed min (icmp slt
+; + select) against the min identity INT32_MAX (2147483647) for lane 0.
+define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_min_i32_ret_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
+ %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+; Unused-result signed min at %out + %index + 4: expects the single-lane
+; (ballot + mbcnt) predication with no result reconstruction.
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_min_i32_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+; Result-used signed min at %out + %index + 4: single-lane atomic, then
+; readfirstlane broadcast and per-lane signed-min reconstruction with
+; identity INT32_MAX (2147483647) selected for lane 0.
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_min_i32_ret_addr64_offset(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
+ %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+; Unused-result signed min on a uniform pointer: only the first active lane
+; (ballot + mbcnt lane-id == 0) issues the atomicrmw.
+define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
+; IR-LABEL: @atomic_min_i32(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+; Result-used signed min on a uniform pointer: lane 0 performs the atomic;
+; each lane's old value is reconstructed via readfirstlane and a signed min
+; against INT32_MAX (lane 0) or %in (other lanes).
+define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
+; IR-LABEL: @atomic_min_i32_ret(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
+
+; Unused-result signed min at an indexed address (%out + %index): expects
+; the first-active-lane predication around the atomicrmw.
+define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_min_i32_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ ret void
+}
+
+; Result-used signed min at an indexed address: single-lane atomic plus
+; readfirstlane broadcast and signed-min result reconstruction
+; (identity INT32_MAX selected for lane 0).
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
+; IR-LABEL: @atomic_min_i32_ret_addr64(
+; IR-NEXT: entry:
+; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
+; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; IR-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; IR-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP2]], i32 0)
+; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
+; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
+; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; IR: 7:
+; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
+; IR-NEXT: br label [[TMP9]]
+; IR: 9:
+; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
+; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
+; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
+; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
+; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
+; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
+; IR-NEXT: ret void
+;
+entry:
+ %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
+ %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
+ store i32 %val, ptr addrspace(1) %out2
+ ret void
+}
More information about the llvm-commits
mailing list