[llvm] 92ee60b - AMDGPU: Drop and upgrade llvm.amdgcn.atomic.inc/dec to atomicrmw
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 21 18:20:32 PDT 2023
Author: Matt Arsenault
Date: 2023-06-21T21:20:26-04:00
New Revision: 92ee60b66f581fdd919315da5c6ae631e581b021
URL: https://github.com/llvm/llvm-project/commit/92ee60b66f581fdd919315da5c6ae631e581b021
DIFF: https://github.com/llvm/llvm-project/commit/92ee60b66f581fdd919315da5c6ae631e581b021.diff
LOG: AMDGPU: Drop and upgrade llvm.amdgcn.atomic.inc/dec to atomicrmw
Added:
llvm/test/Bitcode/amdgcn-atomic.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
Modified:
llvm/docs/ReleaseNotes.rst
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/include/llvm/Support/AtomicOrdering.h
llvm/lib/IR/AutoUpgrade.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
llvm/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll
llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
Removed:
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
################################################################################
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 5834b7f82a386..53c2f3960a0b8 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -133,6 +133,10 @@ Changes to the AMDGPU Backend
improves the interaction between AMDGPU buffer operations and the LLVM memory
model, and so the non `.ptr` intrinsics are deprecated.
+* Removed ``llvm.amdgcn.atomic.inc`` and ``llvm.amdgcn.atomic.dec``
+ intrinsics. :ref:`atomicrmw <i_atomicrmw>` should be used instead
+ with ``uinc_wrap`` and ``udec_wrap``.
+
* Added llvm.amdgcn.log.f32 intrinsic. This provides direct access to
v_log_f32.
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8c0f25b088787..f6f9adda49f89 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -461,21 +461,6 @@ def int_amdgcn_fmad_ftz :
[IntrNoMem, IntrSpeculatable]
>;
-// Fields should mirror atomicrmw
-class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
- [llvm_anyptr_ty,
- LLVMMatchType<0>,
- llvm_i32_ty, // ordering
- llvm_i32_ty, // scope
- llvm_i1_ty], // isVolatile
- [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
- ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], "",
- [SDNPMemOperand]
->;
-
-def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
-def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
-
class AMDGPULDSIntrin :
Intrinsic<[llvm_any_ty],
[LLVMQualPointerType<LLVMMatchType<0>, 3>,
diff --git a/llvm/include/llvm/Support/AtomicOrdering.h b/llvm/include/llvm/Support/AtomicOrdering.h
index 1a0d108300bc4..e08c1b262a92b 100644
--- a/llvm/include/llvm/Support/AtomicOrdering.h
+++ b/llvm/include/llvm/Support/AtomicOrdering.h
@@ -74,7 +74,8 @@ bool operator>=(AtomicOrdering, AtomicOrdering) = delete;
// is a valid AtomicOrdering.
template <typename Int> inline bool isValidAtomicOrdering(Int I) {
return static_cast<Int>(AtomicOrdering::NotAtomic) <= I &&
- I <= static_cast<Int>(AtomicOrdering::SequentiallyConsistent);
+ I <= static_cast<Int>(AtomicOrdering::SequentiallyConsistent) &&
+ I != 3; // 3 lies inside the range but is rejected. NOTE(review): presumably no AtomicOrdering enumerator has the value 3 - confirm against the enum.
}
/// String used by LLVM IR to represent atomic ordering.
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 19e69ef1a89fb..502e93183d50b 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -830,16 +830,25 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name == "arm.cde.vcx3qa.predicated.v2i64.v4i1")
return true;
- if (Name == "amdgcn.alignbit") {
+ if (Name.startswith("amdgcn."))
+ Name = Name.substr(7); // Strip off "amdgcn."
+
+ if (Name == "alignbit") {
// Target specific intrinsic became redundant
NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::fshr,
{F->getReturnType()});
return true;
}
+ if (Name.startswith("atomic.inc") || Name.startswith("atomic.dec")) {
+ // This was replaced with atomicrmw uinc_wrap and udec_wrap, so there's no
+ // new declaration.
+ NewFn = nullptr;
+ return true;
+ }
+
break;
}
-
case 'c': {
if (Name.startswith("ctlz.") && F->arg_size() == 1) {
rename(F);
@@ -2162,6 +2171,38 @@ static Value *UpgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,
llvm_unreachable("Unknown function for ARM CallBase upgrade.");
}
+static Value *UpgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
+ Function *F, IRBuilder<> &Builder) { // Rewrites a call to the removed amdgcn atomic.inc/atomic.dec intrinsics as an atomicrmw; Name arrives with the "amdgcn." prefix already stripped by the caller.
+ const bool IsInc = Name.startswith("atomic.inc.");
+ if (IsInc || Name.startswith("atomic.dec.")) {
+ if (CI->getNumOperands() != 6) // Malformed bitcode. (Presumably the 5 intrinsic arguments plus the callee operand - confirm.)
+ return nullptr;
+
+ AtomicRMWInst::BinOp RMWOp =
+ IsInc ? AtomicRMWInst::UIncWrap : AtomicRMWInst::UDecWrap;
+
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Val = CI->getArgOperand(1);
+ ConstantInt *OrderArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)); // Ordering argument; null when it is not a constant.
+ ConstantInt *VolatileArg = dyn_cast<ConstantInt>(CI->getArgOperand(4)); // isVolatile argument; argument 3 (scope) is never read here.
+
+ AtomicOrdering Order = AtomicOrdering::SequentiallyConsistent; // Fallback used when the ordering is non-constant or not a valid encoding.
+ if (OrderArg && isValidAtomicOrdering(OrderArg->getZExtValue()))
+ Order = static_cast<AtomicOrdering>(OrderArg->getZExtValue());
+ if (Order == AtomicOrdering::NotAtomic ||
+ Order == AtomicOrdering::Unordered)
+ Order = AtomicOrdering::SequentiallyConsistent; // NotAtomic and Unordered are likewise replaced by seq_cst.
+
+ AtomicRMWInst *RMW = Builder.CreateAtomicRMW(RMWOp, Ptr, Val, std::nullopt, Order); // std::nullopt: no explicit alignment is supplied.
+
+ if (!VolatileArg || !VolatileArg->isZero()) // A non-constant volatile flag is conservatively treated as volatile.
+ RMW->setVolatile(true);
+ return RMW;
+ }
+
+ llvm_unreachable("Unknown function for AMDGPU intrinsic upgrade.");
+}
+
/// Upgrade a call to an old intrinsic. All argument and return casting must be
/// provided to seamlessly integrate with existing context.
void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
@@ -2192,6 +2233,9 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
bool IsARM = Name.startswith("arm.");
if (IsARM)
Name = Name.substr(4);
+ bool IsAMDGCN = Name.startswith("amdgcn.");
+ if (IsAMDGCN)
+ Name = Name.substr(7);
if (IsX86 && Name.startswith("sse4a.movnt.")) {
SmallVector<Metadata *, 1> Elts;
@@ -4011,6 +4055,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
CI->getArgOperand(0), "h2f");
} else if (IsARM) {
Rep = UpgradeARMIntrinsicCall(Name, CI, F, Builder);
+ } else if (IsAMDGCN) {
+ Rep = UpgradeAMDGCNIntrinsicCall(Name, CI, F, Builder);
} else {
llvm_unreachable("Unknown function for CallBase upgrade.");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b86037837183d..f19f2076d1f8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4928,20 +4928,6 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
return true;
}
-bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
- MachineIRBuilder &B,
- bool IsInc) const {
- unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP :
- AMDGPU::G_ATOMICRMW_UDEC_WRAP;
- B.buildInstr(Opc)
- .addDef(MI.getOperand(0).getReg())
- .addUse(MI.getOperand(2).getReg())
- .addUse(MI.getOperand(3).getReg())
- .cloneMemRefs(MI);
- MI.eraseFromParent();
- return true;
-}
-
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
switch (IntrID) {
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
@@ -6215,10 +6201,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return legalizeBufferAtomic(MI, B, IntrID);
- case Intrinsic::amdgcn_atomic_inc:
- return legalizeAtomicIncDec(MI, B, true);
- case Intrinsic::amdgcn_atomic_dec:
- return legalizeAtomicIncDec(MI, B, false);
case Intrinsic::trap:
return legalizeTrapIntrinsic(MI, MRI, B);
case Intrinsic::debugtrap:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 24c2ff68897c1..c93c66dd5ec5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -195,9 +195,6 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
- bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
- bool IsInc) const;
-
bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index bedac1bb1a502..317f3f21d2400 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
def : SourceOfDivergence<int_r600_read_tidig_x>;
def : SourceOfDivergence<int_r600_read_tidig_y>;
def : SourceOfDivergence<int_r600_read_tidig_z>;
-def : SourceOfDivergence<int_amdgcn_atomic_inc>;
-def : SourceOfDivergence<int_amdgcn_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c1428e111e20e..e9ca0635836d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -498,8 +498,6 @@ unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const {
switch (Inst->getIntrinsicID()) {
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
@@ -1010,8 +1008,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const {
switch (IID) {
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
@@ -1032,8 +1028,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *NewV) const {
auto IntrID = II->getIntrinsicID();
switch (IntrID) {
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5725672460b34..e3c63eddb4e7d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1106,8 +1106,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
switch (IntrID) {
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
@@ -1258,8 +1256,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
SmallVectorImpl<Value*> &Ops,
Type *&AccessTy) const {
switch (II->getIntrinsicID()) {
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_append:
@@ -7494,8 +7490,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_fadd: {
MemSDNode *M = cast<MemSDNode>(Op);
unsigned Opc;
@@ -7503,12 +7497,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_ds_fadd:
Opc = ISD::ATOMIC_LOAD_FADD;
break;
- case Intrinsic::amdgcn_atomic_inc:
- Opc = ISD::ATOMIC_LOAD_UINC_WRAP;
- break;
- case Intrinsic::amdgcn_atomic_dec:
- Opc = ISD::ATOMIC_LOAD_UDEC_WRAP;
- break;
}
return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
@@ -7520,12 +7508,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
MemSDNode *M = cast<MemSDNode>(Op);
unsigned Opc;
switch (IntrID) {
- case Intrinsic::amdgcn_atomic_inc:
- Opc = ISD::ATOMIC_LOAD_UINC_WRAP;
- break;
- case Intrinsic::amdgcn_atomic_dec:
- Opc = ISD::ATOMIC_LOAD_UDEC_WRAP;
- break;
case Intrinsic::amdgcn_ds_fmin:
Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
break;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
index 3e0236410e65e..a2ef66fe47a09 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
@@ -21,34 +21,6 @@ body: |
SI_RETURN implicit $vgpr0
...
----
-name: test_atomic_inc_dec
-tracksRegLiveness: true
-body: |
- bb.1:
-
- %2:_(s32) = IMPLICIT_DEF
- %3:_(s32) = IMPLICIT_DEF
- %0:_(p1) = G_MERGE_VALUES %2(s32), %3(s32)
- %1:_(s32) = IMPLICIT_DEF
- %5:_(s64) = IMPLICIT_DEF
-
- ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.inc)
- %4:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.inc), %0(p1), %1(s32), 0, 0, 0 :: (load store (s32) )
-
- ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.inc)
- %6:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.inc), %0(p1), %5(s64), 0, 0, 0 :: (load store (s64) )
-
- ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.dec)
- %7:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.dec), %0(p1), %1(s32), 0, 0, 0 :: (load store (s32) )
-
- ; CHECK: DIVERGENT: %{{[0-9]}}: %{{[0-9]}}:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.dec)
- %8:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.atomic.dec), %0(p1), %5(s64), 0, 0, 0 :: (load store (s64) )
- $vgpr0 = COPY %4(s32)
- SI_RETURN implicit $vgpr0
-
-...
-
---
name: test_atomics
tracksRegLiveness: true
@@ -103,6 +75,12 @@ body: |
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_FMIN
%18:_(s32) = G_ATOMICRMW_FMIN %1, %5
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UINC_WRAP
+ %19:_(s32) = G_ATOMICRMW_UINC_WRAP %1, %5
+
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UDEC_WRAP
+ %20:_(s32) = G_ATOMICRMW_UDEC_WRAP %1, %5
+
$vgpr0 = COPY %4(s32)
SI_RETURN implicit $vgpr0
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
index bf6c6cdc6f10a..59fbd5627ebfc 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
@@ -15,39 +15,10 @@ define amdgpu_kernel void @test2(ptr %ptr, i32 %cmp, i32 %new) {
ret void
}
-; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false)
-define i32 @test_atomic_inc_i32(ptr addrspace(1) %ptr, i32 %val) #0 {
- %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false)
- ret i32 %ret
-}
-
-; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false)
-define i64 @test_atomic_inc_i64(ptr addrspace(1) %ptr, i64 %val) #0 {
- %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false)
- ret i64 %ret
-}
-
-; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false)
-define i32 @test_atomic_dec_i32(ptr addrspace(1) %ptr, i32 %val) #0 {
- %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 %val, i32 0, i32 0, i1 false)
- ret i32 %ret
-}
-
-; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false)
-define i64 @test_atomic_dec_i64(ptr addrspace(1) %ptr, i64 %val) #0 {
- %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 %val, i32 0, i32 0, i1 false)
- ret i64 %ret
-}
-
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #1
-declare i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) nocapture, i64, i32, i32, i1) #1
-declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #1
-declare i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) nocapture, i64, i32, i32, i1) #1
-
; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
define amdgpu_kernel void @test_atomic_csub_i32(ptr addrspace(1) %ptr, i32 %val) #0 {
%ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
- store i32 %ret, ptr addrspace(1) %ptr, align 4
+ store i32 %ret, i32 addrspace(1)* %ptr, align 4
ret void
}
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
new file mode 100644
index 0000000000000..740e0e1d03bab
--- /dev/null
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -0,0 +1,115 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+
+define void @atomic_inc(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw uinc_wrap ptr %ptr0, i32 42 seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw uinc_wrap ptr addrspace(3) %ptr3, i32 46 seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw uinc_wrap ptr %ptr0, i64 48 seq_cst, align 8
+ %result3 = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr0, i64 48, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i64 45 seq_cst, align 8
+ %result4 = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr1, i64 45, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw uinc_wrap ptr addrspace(3) %ptr3, i64 4345 seq_cst, align 8
+ %result5 = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr3, i64 4345, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw volatile uinc_wrap ptr addrspace(3) %ptr3, i64 4345 seq_cst, align 8
+ %result6 = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr3, i64 4345, i32 0, i32 0, i1 true)
+ ret void
+}
+
+define void @atomic_dec(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw udec_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw udec_wrap ptr addrspace(3) %ptr3, i32 46 seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw udec_wrap ptr %ptr0, i64 48 seq_cst, align 8
+ %result3 = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %ptr0, i64 48, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw udec_wrap ptr addrspace(1) %ptr1, i64 45 seq_cst, align 8
+ %result4 = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr1, i64 45, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw udec_wrap ptr addrspace(3) %ptr3, i64 4345 seq_cst, align 8
+ %result5 = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %ptr3, i64 4345, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw volatile udec_wrap ptr addrspace(3) %ptr3, i64 4345 seq_cst, align 8
+ %result6 = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %ptr3, i64 4345, i32 0, i32 0, i1 true)
+ ret void
+}
+
+; Test some invalid ordering handling
+define void @ordering(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw volatile uinc_wrap ptr %ptr0, i32 42 seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr0, i32 42, i32 -1, i32 0, i1 true)
+
+ ; CHECK: atomicrmw volatile uinc_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 true)
+
+ ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 1, i32 0, i1 false)
+
+ ; CHECK: atomicrmw volatile uinc_wrap ptr addrspace(1) %ptr1, i32 43 monotonic, align 4
+ %result3 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 2, i32 0, i1 true)
+
+ ; CHECK: atomicrmw uinc_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4
+ %result4 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 false)
+
+ ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 seq_cst, align 4
+ %result5 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 4, i1 true)
+
+ ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 seq_cst, align 4
+ %result6 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 5, i1 false)
+
+ ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 seq_cst, align 4
+ %result7 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 6, i1 true)
+
+ ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 seq_cst, align 4
+ %result8 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 7, i1 false)
+
+ ; CHECK:= atomicrmw volatile udec_wrap ptr %ptr0, i32 42 seq_cst, align 4
+ %result9 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 0, i32 8, i1 true)
+
+ ; CHECK:= atomicrmw volatile udec_wrap ptr addrspace(1) %ptr1, i32 43 seq_cst, align 4
+ %result10 = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 true)
+ ret void
+}
+
+define void @immarg_violations(ptr %ptr0, i32 %val32, i1 %val1) {
+ ; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 %val32, i32 0, i1 false)
+
+; CHECK: atomicrmw udec_wrap ptr %ptr0, i32 42 monotonic, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 2, i32 %val32, i1 false)
+
+ ; CHECK: atomicrmw volatile udec_wrap ptr %ptr0, i32 42 monotonic, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr0, i32 42, i32 2, i32 0, i1 %val1)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
+
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
+
+attributes #0 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
new file mode 100644
index 0000000000000..bd308af4db405
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -0,0 +1,2968 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+; FIXME: Merge with other test. DS offset folding doesn't work due to
+; register bank copies, and no return optimization is missing.
+
+@lds0 = internal addrspace(3) global [512 x i32] undef
+@lds1 = internal addrspace(3) global [512 x i64] undef, align 8
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 {
+; CI-LABEL: lds_atomic_dec_ret_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dword s2, s[4:5], 0x2
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: lds_atomic_dec_ret_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[4:5], 0x8
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_dec_ret_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_dec_rtn_u32 v0, v0, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: lds_atomic_dec_ret_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX10-NEXT: v_mov_b32_e32 v1, 42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_dec_rtn_u32 v0, v0, v1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_dec_ret_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_dec_rtn_u32 v0, v0, v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 {
+; CI-LABEL: lds_atomic_dec_ret_i32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dword s2, s[4:5], 0x2
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: lds_atomic_dec_ret_i32_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[4:5], 0x8
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_dec_ret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: lds_atomic_dec_ret_i32_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_dec_ret_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 seq_cst, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) #1 {
+; CI-LABEL: lds_atomic_dec_noret_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dword s0, s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v1, s0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_dec_u32 v1, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: lds_atomic_dec_noret_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s0, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_dec_u32 v1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_dec_noret_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_dec_u32 v0, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: lds_atomic_dec_noret_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v1, 42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_dec_u32 v0, v1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_dec_noret_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_dec_u32 v0, v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr) #1 {
+; CI-LABEL: lds_atomic_dec_noret_i32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dword s0, s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v1, s0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_dec_u32 v1, v0 offset:16
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: lds_atomic_dec_noret_i32_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s0, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_dec_u32 v1, v0 offset:16
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_dec_noret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_dec_u32 v1, v0 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: lds_atomic_dec_noret_i32_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_dec_u32 v1, v0 offset:16
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_dec_noret_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_dec_u32 v1, v0 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i32 42 seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_ret_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_ret_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_ret_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_ret_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_ret_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_ret_i32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s2, s2, 16
+; CI-NEXT: s_addc_u32 s3, s3, 0
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_ret_i32_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s2, s2, 16
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_ret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_ret_i32_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_ret_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_noret_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_noret_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_noret_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_noret_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_noret_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1]
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_noret_i32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s0, s0, 16
+; CI-NEXT: s_addc_u32 s1, s1, 0
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_noret_i32_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_noret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_noret_i32_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_noret_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_store_dword v[0:1], v3
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_store_dword v[0:1], v3
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[2:3] offset:20 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_ret_i32_offset_addr64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 42
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec v1, v0, v1, s[2:3] offset:20 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_ret_i32_offset_addr64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u32 v1, v0, v1, s[2:3] offset:20 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
+ %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
+ %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ store i32 %result, ptr addrspace(1) %out.gep, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:20
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_noret_i32_offset_addr64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v1, 42
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec v0, v1, s[0:1] offset:20
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_noret_i32_offset_addr64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u32 v0, v1, s[0:1] offset:20
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
+ %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_ret_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_ret_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_ret_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_ret_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_ret_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr %ptr, i32 42 seq_cst, align 4
+ store i32 %result, ptr %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_ret_i32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s2, s2, 16
+; CI-NEXT: s_addc_u32 s3, s3, 0
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_ret_i32_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s2, s2, 16
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s2, s2, 16
+; GFX10-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_ret_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
+ store i32 %result, ptr %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_noret_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_noret_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_noret_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec v[0:1], v2
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_noret_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_noret_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr %ptr, i32 42 seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_noret_i32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s0, s0, 16
+; CI-NEXT: s_addc_u32 s1, s1, 0
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_noret_i32_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s0, s0, 16
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_noret_i32_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_store_dword v[0:1], v3
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_store_dword v[0:1], v3
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec v3, v[0:1], v3 offset:20 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: flat_store_dword v[0:1], v3
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, 42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_store_dword v[0:1], v3
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v3, 42 :: v_dual_lshlrev_b32 v2, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u32 v3, v[0:1], v3 offset:20 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: flat_store_b32 v[0:1], v3
+; GFX11-NEXT: s_endpgm
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, ptr %ptr, i32 %id
+ %out.gep = getelementptr i32, ptr %out, i32 %id
+ %gep = getelementptr i32, ptr %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
+ store i32 %result, ptr %out.gep, align 4
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:20
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:20
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i32, ptr %ptr, i32 %id
+ %gep = getelementptr i32, ptr %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_ret_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_add_u32 s0, s0, 4
+; CI-NEXT: s_addc_u32 s1, s1, 0
+; CI-NEXT: v_mov_b32_e32 v5, s1
+; CI-NEXT: v_mov_b32_e32 v4, s0
+; CI-NEXT: flat_store_dword v[2:3], v0
+; CI-NEXT: flat_store_dword v[4:5], v1
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_ret_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_add_u32 s0, s0, 4
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: flat_store_dword v[4:5], v1
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_ret_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_ret_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_ret_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr %ptr, i64 42 seq_cst, align 8
+ store i64 %result, ptr %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_ret_i64_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s2, s2, 32
+; CI-NEXT: s_addc_u32 s3, s3, 0
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_add_u32 s0, s0, 4
+; CI-NEXT: s_addc_u32 s1, s1, 0
+; CI-NEXT: v_mov_b32_e32 v5, s1
+; CI-NEXT: v_mov_b32_e32 v4, s0
+; CI-NEXT: flat_store_dword v[2:3], v0
+; CI-NEXT: flat_store_dword v[4:5], v1
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_ret_i64_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_add_u32 s0, s0, 4
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: flat_store_dword v[4:5], v1
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_ret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_ret_i64_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s2, s2, 32
+; GFX10-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_ret_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i64, ptr %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+ store i64 %result, ptr %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_noret_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_noret_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_noret_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_noret_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_noret_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr %ptr, i64 42 seq_cst, align 8
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_noret_i64_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s0, s0, 32
+; CI-NEXT: s_addc_u32 s1, s1, 0
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_noret_i64_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s0, s0, 32
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_noret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_noret_i64_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_add_u32 s0, s0, 32
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_noret_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i64, ptr %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: v_add_i32_e32 v4, vcc, 4, v2
+; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; CI-NEXT: flat_store_dword v[2:3], v0
+; CI-NEXT: flat_store_dword v[4:5], v1
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2
+; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
+; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: flat_store_dword v[4:5], v1
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:40 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX11-NEXT: s_endpgm
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, ptr %ptr, i32 %id
+ %out.gep = getelementptr i64, ptr %out, i32 %id
+ %gep = getelementptr i64, ptr %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+ store i64 %result, ptr %out.gep, align 4
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:40
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, ptr %ptr, i32 %id
+ %gep = getelementptr i64, ptr %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8
+ ret void
+}
+
+define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 {
+; CI-LABEL: atomic_dec_shl_base_lds_0:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; CI-NEXT: v_mov_b32_e32 v2, 9
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: flat_store_dword v[0:1], v3
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: atomic_dec_shl_base_lds_0:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; VI-NEXT: v_mov_b32_e32 v2, 9
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: atomic_dec_shl_base_lds_0:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 9
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_store_dword v2, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: atomic_dec_shl_base_lds_0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, 9
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: global_store_dword v2, v0, s[2:3]
+; GFX10-NEXT: global_store_dword v2, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_shl_base_lds_0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_dec_rtn_u32 v1, v1, v2 offset:8
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
+; GFX11-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
+ %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i32 9 seq_cst, align 4
+ store i32 %idx.0, ptr addrspace(1) %add_use, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 {
+; CI-LABEL: lds_atomic_dec_ret_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dword s2, s[4:5], 0x2
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: lds_atomic_dec_ret_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[4:5], 0x8
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_dec_ret_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: lds_atomic_dec_ret_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_dec_ret_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8
+ store i64 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 {
+; CI-LABEL: lds_atomic_dec_ret_i64_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dword s2, s[4:5], 0x2
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: lds_atomic_dec_ret_i64_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s2, s[4:5], 0x8
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_dec_ret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: lds_atomic_dec_ret_i64_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x8
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_dec_ret_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 seq_cst, align 8
+ store i64 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) #1 {
+; CI-LABEL: lds_atomic_dec_noret_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dword s0, s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_dec_u64 v2, v[0:1]
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: lds_atomic_dec_noret_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s0, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_dec_u64 v2, v[0:1]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_dec_noret_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_dec_u64 v2, v[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: lds_atomic_dec_noret_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_dec_u64 v2, v[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_dec_noret_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_dec_u64 v2, v[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8
+ ret void
+}
+
+define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr) #1 {
+; CI-LABEL: lds_atomic_dec_noret_i64_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dword s0, s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_dec_u64 v2, v[0:1] offset:32
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: lds_atomic_dec_noret_i64_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s0, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_dec_u64 v2, v[0:1] offset:32
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: lds_atomic_dec_noret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_dec_u64 v2, v[0:1] offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: lds_atomic_dec_noret_i64_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_dec_u64 v2, v[0:1] offset:32
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: lds_atomic_dec_noret_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_dec_u64 v2, v[0:1] offset:32
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr addrspace(3) %gep, i64 42 seq_cst, align 8
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_ret_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_ret_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_ret_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_ret_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_ret_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8
+ store i64 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_ret_i64_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s2, s2, 32
+; CI-NEXT: s_addc_u32 s3, s3, 0
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_ret_i64_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_ret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_ret_i64_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_ret_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ store i64 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_noret_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_noret_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_noret_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_noret_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_noret_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_noret_i64_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s0, s0, 32
+; CI-NEXT: s_addc_u32 s1, s1, 0
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_noret_i64_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s0, s0, 32
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_noret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_noret_i64_offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_noret_i64_offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 42
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_ret_i64_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_ret_i64_offset_addr64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v1, 42
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_ret_i64_offset_addr64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v1, 42
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
+ %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
+ %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ store i64 %result, ptr addrspace(1) %out.gep, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #1 {
+; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec_x2 v0, v[1:2], s[0:1] offset:40
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_atomic_dec_noret_i64_offset_addr64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v1, 42
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_dec_x2 v0, v[1:2], s[0:1] offset:40
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: global_atomic_dec_noret_i64_offset_addr64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 3, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_dec_u64 v0, v[1:2], s[0:1] offset:40
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: s_endpgm
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
+ %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
+ %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ ret void
+}
+
+define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 {
+; CI-LABEL: atomic_dec_shl_base_lds_0_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v1, 9
+; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; CI-NEXT: v_mov_b32_e32 v2, 0
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v4, s3
+; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
+; CI-NEXT: v_mov_b32_e32 v3, s2
+; CI-NEXT: flat_store_dword v[3:4], v0
+; CI-NEXT: v_mov_b32_e32 v4, s1
+; CI-NEXT: v_mov_b32_e32 v3, s0
+; CI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: atomic_dec_shl_base_lds_0_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v1, 9
+; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
+; VI-NEXT: v_mov_b32_e32 v3, s2
+; VI-NEXT: flat_store_dword v[3:4], v0
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v1, 9
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, 2, v0
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: atomic_dec_shl_base_lds_0_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v1, 9
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: global_store_dword v3, v0, s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: atomic_dec_shl_base_lds_0_i64:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_mov_b32_e32 v1, 9
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_store_b32 v3, v0, s[2:3]
+; GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+ %idx.0 = add nsw i32 %tid.x, 2
+ %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
+ %result = atomicrmw udec_wrap ptr addrspace(3) %arrayidx0, i64 9 seq_cst, align 8
+ store i32 %idx.0, ptr addrspace(1) %add_use, align 4
+ store i64 %result, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+attributes #0 = { nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind memory(none) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
similarity index 74%
rename from llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
rename to llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 17763b474a7fe..c6c8fbf03e91a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -8,18 +8,12 @@
; FIXME: Merge with other test. DS offset folding doesn't work due to
; register bank copies, and no return optimization is missing.
+@lds0 = internal addrspace(3) global [512 x i32] undef, align 4
+@lds1 = internal addrspace(3) global [512 x i64] undef, align 8
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr nocapture, i32, i32, i32, i1) #2
-
-declare i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) nocapture, i64, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) nocapture, i64, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr nocapture, i64, i32, i32, i1) #2
-
-declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare i32 @llvm.amdgcn.workitem.id.x() #0
-define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 {
; CI-LABEL: lds_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[4:5], 0x2
@@ -28,10 +22,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -43,10 +38,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -57,9 +53,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; GFX9-NEXT: v_mov_b32_e32 v1, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -70,9 +67,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -83,22 +83,21 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_inc_rtn_u32 v0, v0, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false), !noalias !0
- store i32 %result, ptr addrspace(1) %out
+ %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
-!0 = !{!1}
-!1 = distinct !{!1, !2}
-!2 = distinct !{!2}
-
-define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 {
; CI-LABEL: lds_atomic_inc_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[4:5], 0x2
@@ -107,10 +106,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -122,10 +122,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -136,9 +137,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -149,9 +151,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -162,19 +167,22 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr addrspace(1) %out
+ %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 seq_cst, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) #1 {
; CI-LABEL: lds_atomic_inc_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x0
@@ -182,7 +190,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) nounw
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_u32 v1, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_endpgm
;
; VI-LABEL: lds_atomic_inc_noret_i32:
@@ -192,7 +202,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) nounw
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_u32 v1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: lds_atomic_inc_noret_i32:
@@ -201,7 +213,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) nounw
; GFX9-NEXT: v_mov_b32_e32 v1, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_inc_u32 v0, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: lds_atomic_inc_noret_i32:
@@ -210,7 +224,11 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) nounw
; GFX10-NEXT: v_mov_b32_e32 v1, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_inc_u32 v0, v1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lds_atomic_inc_noret_i32:
@@ -218,13 +236,17 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32(ptr addrspace(3) %ptr) nounw
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_inc_u32 v0, v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
ret void
}
-define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr) #1 {
; CI-LABEL: lds_atomic_inc_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x0
@@ -232,7 +254,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_u32 v1, v0 offset:16
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_endpgm
;
; VI-LABEL: lds_atomic_inc_noret_i32_offset:
@@ -242,7 +266,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_u32 v1, v0 offset:16
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: lds_atomic_inc_noret_i32_offset:
@@ -251,7 +277,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_inc_u32 v1, v0 offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: lds_atomic_inc_noret_i32_offset:
@@ -260,7 +288,11 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_inc_u32 v1, v0 offset:16
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lds_atomic_inc_noret_i32_offset:
@@ -268,14 +300,18 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(ptr addrspace(3) %ptr
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_inc_u32 v1, v0 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i32 42 seq_cst, align 4
ret void
}
-define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -283,10 +319,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -297,10 +335,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -309,9 +349,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -320,9 +361,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -330,18 +374,21 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr addrspace(1) %out
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -351,10 +398,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -367,10 +416,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -379,9 +430,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -390,9 +442,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -400,19 +455,22 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr addrspace(1) %out
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -420,7 +478,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) no
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_inc_noret_i32:
@@ -430,7 +491,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) no
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_inc_noret_i32:
@@ -438,8 +502,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) no
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_inc_noret_i32:
@@ -447,23 +513,30 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) no
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1]
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 42 seq_cst, align 4
ret void
}
-define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -473,7 +546,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_inc_noret_i32_offset:
@@ -485,7 +561,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_inc_noret_i32_offset:
@@ -493,8 +572,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_inc_noret_i32_offset:
@@ -502,24 +583,31 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i32_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
ret void
}
-define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -532,12 +620,14 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_store_dword v[0:1], v3
; CI-NEXT: s_endpgm
;
@@ -553,12 +643,14 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
;
@@ -567,9 +659,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -578,9 +671,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 42
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc v1, v0, v1, s[2:3] offset:20 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -588,9 +684,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u32 v1, v0, v1, s[2:3] offset:20 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -598,12 +697,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
%gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr addrspace(1) %out.gep
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
+ store i32 %result, ptr addrspace(1) %out.gep, align 4
ret void
}
-define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -616,7 +715,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64:
@@ -631,7 +733,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64:
@@ -639,8 +744,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:20
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_inc_noret_i32_offset_addr64:
@@ -648,37 +755,43 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 42
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc v0, v1, s[0:1] offset:20
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i32_offset_addr64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u32 v0, v1, s[0:1] offset:20
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
%gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4
ret void
}
-@lds0 = internal addrspace(3) global [512 x i32] undef, align 4
-
-define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 {
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 {
; CI-LABEL: atomic_inc_shl_base_lds_0_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; CI-NEXT: v_mov_b32_e32 v2, 9
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8
-; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_store_dword v[0:1], v3
@@ -693,9 +806,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 9
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8
-; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v3
@@ -709,10 +823,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 9
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: global_store_dword v2, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -723,9 +838,12 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; GFX10-NEXT: v_mov_b32_e32 v2, 9
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-NEXT: global_store_dword v2, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@@ -735,24 +853,27 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_inc_rtn_u32 v1, v1, v2 offset:8
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-NEXT: global_store_b32 v2, v1, s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
- %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %arrayidx0, i32 9, i32 0, i32 0, i1 false)
- store i32 %idx.0, ptr addrspace(1) %add_use
- store i32 %val0, ptr addrspace(1) %out
+ %result = atomicrmw uinc_wrap ptr addrspace(3) %arrayidx0, i32 9 seq_cst, align 4
+ store i32 %idx.0, ptr addrspace(1) %add_use, align 4
+ store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 {
; CI-LABEL: lds_atomic_inc_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[4:5], 0x2
@@ -762,10 +883,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
@@ -778,10 +900,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -793,9 +916,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -807,9 +931,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -821,18 +948,21 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr addrspace(1) %out
+ %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8
+ store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #1 {
; CI-LABEL: lds_atomic_inc_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[4:5], 0x2
@@ -842,10 +972,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
@@ -858,10 +989,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -873,9 +1005,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -887,9 +1020,12 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -901,19 +1037,22 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr addrspace(1) %out
+ %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 seq_cst, align 8
+ store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) #1 {
; CI-LABEL: lds_atomic_inc_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x0
@@ -922,7 +1061,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) nounw
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_u64 v2, v[0:1]
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_endpgm
;
; VI-LABEL: lds_atomic_inc_noret_i64:
@@ -933,7 +1074,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) nounw
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_u64 v2, v[0:1]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: lds_atomic_inc_noret_i64:
@@ -943,7 +1086,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) nounw
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_inc_u64 v2, v[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: lds_atomic_inc_noret_i64:
@@ -953,7 +1098,11 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) nounw
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_inc_u64 v2, v[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lds_atomic_inc_noret_i64:
@@ -962,13 +1111,17 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64(ptr addrspace(3) %ptr) nounw
; GFX11-NEXT: v_mov_b32_e32 v0, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_inc_u64 v2, v[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i64 42 seq_cst, align 8
ret void
}
-define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr) #1 {
; CI-LABEL: lds_atomic_inc_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x0
@@ -977,7 +1130,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_u64 v2, v[0:1] offset:32
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_endpgm
;
; VI-LABEL: lds_atomic_inc_noret_i64_offset:
@@ -988,7 +1143,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_u64 v2, v[0:1] offset:32
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: lds_atomic_inc_noret_i64_offset:
@@ -998,7 +1155,9 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_inc_u64 v2, v[0:1] offset:32
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: lds_atomic_inc_noret_i64_offset:
@@ -1008,7 +1167,11 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_inc_u64 v2, v[0:1] offset:32
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lds_atomic_inc_noret_i64_offset:
@@ -1017,14 +1180,18 @@ define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(ptr addrspace(3) %ptr
; GFX11-NEXT: v_mov_b32_e32 v0, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_inc_u64 v2, v[0:1] offset:32
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr addrspace(3) %gep, i64 42 seq_cst, align 8
ret void
}
-define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1033,10 +1200,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
@@ -1048,10 +1217,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -1061,9 +1232,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -1073,9 +1245,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -1084,18 +1259,21 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr addrspace(1) %out
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8
+ store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1106,10 +1284,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
@@ -1123,10 +1303,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -1136,9 +1318,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -1148,9 +1331,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -1159,19 +1345,22 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr addrspace(1) %out
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1180,7 +1369,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) no
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_inc_noret_i64:
@@ -1191,7 +1383,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) no
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_inc_noret_i64:
@@ -1200,8 +1395,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) no
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_inc_noret_i64:
@@ -1210,8 +1407,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) no
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i64:
@@ -1219,15 +1420,18 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) no
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 42 seq_cst, align 8
ret void
}
-define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1238,7 +1442,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_inc_noret_i64_offset:
@@ -1251,7 +1458,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_inc_noret_i64_offset:
@@ -1260,8 +1470,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; GFX9-NEXT: v_mov_b32_e32 v0, 42
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_inc_noret_i64_offset:
@@ -1270,8 +1482,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i64_offset:
@@ -1279,16 +1495,19 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 42
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
ret void
}
-define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1302,12 +1521,14 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
@@ -1324,12 +1545,14 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -1339,9 +1562,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; GFX9-NEXT: v_mov_b32_e32 v1, 42
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -1351,9 +1575,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; GFX10-NEXT: v_mov_b32_e32 v1, 42
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -1362,9 +1589,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 42
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v3, v[1:2], s[2:3] offset:40 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1372,12 +1602,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
%gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr addrspace(1) %out.gep
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
+ store i64 %result, ptr addrspace(1) %out.gep, align 4
ret void
}
-define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #1 {
; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1391,7 +1621,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64:
@@ -1407,7 +1640,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64:
@@ -1416,8 +1652,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; GFX9-NEXT: v_mov_b32_e32 v1, 42
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc_x2 v0, v[1:2], s[0:1] offset:40
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_atomic_inc_noret_i64_offset_addr64:
@@ -1426,8 +1664,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; GFX10-NEXT: v_mov_b32_e32 v1, 42
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_inc_x2 v0, v[1:2], s[0:1] offset:40
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_inc_noret_i64_offset_addr64:
@@ -1435,31 +1677,86 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_lshlrev_b32 v0, 3, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_inc_u64 v0, v[1:2], s[0:1] offset:40
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
%gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #0 {
-; GCN-LABEL: flat_atomic_inc_ret_i32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-NEXT: v_mov_b32_e32 v2, 42
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN-NEXT: flat_store_dword v[0:1], v2
-; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_inc_ret_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_ret_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_ret_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_inc_ret_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_ret_i32:
; GFX11: ; %bb.0:
@@ -1467,17 +1764,21 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #0 {
; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 glc
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr %out
+ %result = atomicrmw uinc_wrap ptr %ptr, i32 42 seq_cst, align 4
+ store i32 %result, ptr %out, align 4
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1487,10 +1788,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
@@ -1503,10 +1806,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1517,10 +1822,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_endpgm
;
@@ -1533,10 +1840,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
@@ -1546,27 +1857,76 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #0
; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr %out
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
+ store i32 %result, ptr %out, align 4
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) nounwind {
-; GCN-LABEL: flat_atomic_inc_noret_i32:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GCN-NEXT: v_mov_b32_e32 v2, 42
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: flat_atomic_inc v[0:1], v2
-; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_inc_noret_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_inc v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_inc v[0:1], v2
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_inc_noret_i32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_inc v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i32:
; GFX11: ; %bb.0:
@@ -1574,13 +1934,19 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) nounwind {
; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr %ptr, i32 42 seq_cst, align 4
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1590,7 +1956,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind {
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_inc_noret_i32_offset:
@@ -1602,7 +1971,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind {
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
@@ -1612,7 +1984,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
@@ -1624,7 +1999,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind {
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_inc v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i32_offset:
@@ -1633,14 +2014,20 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) nounwind {
; GFX11-NEXT: v_mov_b32_e32 v2, 42
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1653,12 +2040,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[0:1], v3
; CI-NEXT: s_endpgm
;
@@ -1674,12 +2063,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
;
@@ -1693,12 +2084,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_inc v3, v[0:1], v3 offset:20 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_dword v[0:1], v3
; GFX9-NEXT: s_endpgm
;
@@ -1714,12 +2107,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v3
; GFX10-NEXT: s_endpgm
;
@@ -1732,24 +2129,28 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u32 v3, v[0:1], v3 offset:20 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v3
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
%out.gep = getelementptr i32, ptr %out, i32 %id
%gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr %out.gep
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
+ store i32 %result, ptr %out.gep, align 4
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -1762,7 +2163,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc v[0:1], v2
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
@@ -1777,7 +2181,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc v[0:1], v2
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
@@ -1790,7 +2197,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:20
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
@@ -1805,7 +2215,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_inc v[0:1], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
@@ -1818,18 +2234,22 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #0
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:20
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, ptr %ptr, i32 %id
%gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4
ret void
}
-@lds1 = internal addrspace(3) global [512 x i64] undef, align 8
-
-define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 {
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #1 {
; CI-LABEL: atomic_inc_shl_base_lds_0_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1837,6 +2257,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v4, s3
@@ -1855,6 +2276,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s3
@@ -1872,10 +2294,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
; GFX9-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
; GFX9-NEXT: s_endpgm
@@ -1887,9 +2310,12 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
-; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
; GFX10-NEXT: s_endpgm
@@ -1900,24 +2326,27 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 3, v0
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
-; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-NEXT: global_store_b64 v3, v[1:2], s[0:1]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #2
%idx.0 = add nsw i32 %tid.x, 2
%arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
- %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %arrayidx0, i64 9, i32 0, i32 0, i1 false)
- store i32 %idx.0, ptr addrspace(1) %add_use
- store i64 %val0, ptr addrspace(1) %out
+ %result = atomicrmw uinc_wrap ptr addrspace(3) %arrayidx0, i64 9 seq_cst, align 8
+ store i32 %idx.0, ptr addrspace(1) %add_use, align 4
+ store i64 %result, ptr addrspace(1) %out, align 4
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -1926,14 +2355,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 {
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: s_add_u32 s0, s0, 4
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: flat_store_dword v[4:5], v1
; CI-NEXT: s_endpgm
@@ -1946,14 +2377,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_u32 s0, s0, 4
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: flat_store_dword v[4:5], v1
; VI-NEXT: s_endpgm
@@ -1966,10 +2399,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -1981,10 +2416,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -1995,17 +2434,21 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #0 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
; GFX11-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] glc
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr %out
+ %result = atomicrmw uinc_wrap ptr %ptr, i64 42 seq_cst, align 8
+ store i64 %result, ptr %out, align 4
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2016,14 +2459,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: s_add_u32 s0, s0, 4
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: flat_store_dword v[4:5], v1
; CI-NEXT: s_endpgm
@@ -2038,14 +2483,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_u32 s0, s0, 4
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: flat_store_dword v[4:5], v1
; VI-NEXT: s_endpgm
@@ -2058,10 +2505,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -2075,10 +2524,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -2089,28 +2542,80 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
; GFX11-NEXT: v_mov_b32_e32 v3, s3
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr %out
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8
+ store i64 %result, ptr %out, align 4
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) nounwind {
-; GCN-LABEL: flat_atomic_inc_noret_i64:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GCN-NEXT: v_mov_b32_e32 v0, 42
-; GCN-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
-; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
+; CI-LABEL: flat_atomic_inc_noret_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CI-NEXT: v_mov_b32_e32 v0, 42
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, 42
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: flat_atomic_inc_noret_i64:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i64:
; GFX11: ; %bb.0:
@@ -2119,13 +2624,19 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) nounwind {
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr %ptr, i64 42 seq_cst, align 8
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -2136,7 +2647,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind {
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_inc_noret_i64_offset:
@@ -2149,7 +2663,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind {
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
@@ -2160,7 +2677,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset:
@@ -2173,7 +2693,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind {
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i64_offset:
@@ -2183,14 +2709,20 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) nounwind {
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
%gep = getelementptr i64, ptr %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
@@ -2204,14 +2736,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: v_add_i32_e32 v4, vcc, 4, v2
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: flat_store_dword v[4:5], v1
; CI-NEXT: s_endpgm
@@ -2229,14 +2763,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: flat_store_dword v[4:5], v1
; VI-NEXT: s_endpgm
@@ -2252,12 +2788,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: s_endpgm
;
@@ -2274,12 +2812,16 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX10-NEXT: s_endpgm
;
@@ -2294,24 +2836,28 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:40 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
%out.gep = getelementptr i64, ptr %out, i32 %id
%gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr %out.gep
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8
+ store i64 %result, ptr %out.gep, align 4
ret void
}
-define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -2325,7 +2871,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0
; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
@@ -2341,7 +2890,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
@@ -2355,7 +2907,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
@@ -2371,7 +2926,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 40
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
@@ -2385,16 +2946,22 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:40
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i64, ptr %ptr, i32 %id
%gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
+ %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8
ret void
}
-define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(3) %ptr) #0 {
+define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(3) %ptr) #1 {
; CI-LABEL: nocse_lds_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s6, s[4:5], 0x4
@@ -2403,16 +2970,17 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s6
-; CI-NEXT: ds_inc_rtn_u32 v4, v1, v0
-; CI-NEXT: ds_inc_rtn_u32 v5, v1, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: s_waitcnt lgkmcnt(1)
-; CI-NEXT: flat_store_dword v[0:1], v4
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[2:3], v5
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: flat_store_dword v[0:1], v3
; CI-NEXT: s_endpgm
;
; VI-LABEL: nocse_lds_atomic_inc_ret_i32:
@@ -2423,45 +2991,54 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: ds_inc_rtn_u32 v4, v1, v0
-; VI-NEXT: ds_inc_rtn_u32 v5, v1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_waitcnt lgkmcnt(1)
-; VI-NEXT: flat_store_dword v[0:1], v4
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[2:3], v5
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0
-; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NEXT: global_store_dword v1, v2, s[0:1]
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: nocse_lds_atomic_inc_ret_i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[4:5], 0x10
-; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX10-NEXT: ds_inc_rtn_u32 v2, v1, v0
-; GFX10-NEXT: ds_inc_rtn_u32 v0, v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_inc_rtn_u32 v2, v0, v1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: ds_inc_rtn_u32 v0, v0, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
@@ -2470,25 +3047,33 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v0, s2
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
-; GFX11-NEXT: ds_inc_rtn_u32 v2, v1, v0
-; GFX11-NEXT: ds_inc_rtn_u32 v0, v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_inc_rtn_u32 v2, v0, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: ds_inc_rtn_u32 v0, v0, v1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
- %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false)
- %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false)
-
- store i32 %result0, ptr addrspace(1) %out0
- store i32 %result1, ptr addrspace(1) %out1
+ %result0 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
+ %result1 = atomicrmw uinc_wrap ptr addrspace(3) %ptr, i32 42 seq_cst, align 4
+ store i32 %result0, ptr addrspace(1) %out0, align 4
+ store i32 %result1, ptr addrspace(1) %out1, align 4
ret void
}
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind argmemonly }
+attributes #0 = { nounwind speculatable willreturn memory(none) }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind memory(none) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
deleted file mode 100644
index 4bdf1401529d8..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
+++ /dev/null
@@ -1,1259 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CI %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
-
-; FIXME: Merge with other test. DS offset folding doesn't work due to
-; register bank copies, and no return optimization is missing.
-
-
-declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr nocapture, i32, i32, i32, i1) #2
-
-declare i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) nocapture, i64, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) nocapture, i64, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr nocapture, i64, i32, i32, i1) #2
-
-declare i32 @llvm.amdgcn.workitem.id.x() #1
-
-define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
-; CI-LABEL: lds_atomic_dec_ret_i32:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dword s2, s[4:5], 0x2
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: lds_atomic_dec_ret_i32:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[4:5], 0x8
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v2
-; VI-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
-; CI-LABEL: lds_atomic_dec_ret_i32_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dword s2, s[4:5], 0x2
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: lds_atomic_dec_ret_i32_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[4:5], 0x8
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v2
-; VI-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @lds_atomic_dec_noret_i32(ptr addrspace(3) %ptr) nounwind {
-; CI-LABEL: lds_atomic_dec_noret_i32:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dword s0, s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v1, s0
-; CI-NEXT: ds_dec_u32 v1, v0
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: lds_atomic_dec_noret_i32:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s0, s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: ds_dec_u32 v1, v0
-; VI-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(ptr addrspace(3) %ptr) nounwind {
-; CI-LABEL: lds_atomic_dec_noret_i32_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dword s0, s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v1, s0
-; CI-NEXT: ds_dec_u32 v1, v0 offset:16
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: lds_atomic_dec_noret_i32_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s0, s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: ds_dec_u32 v1, v0 offset:16
-; VI-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(3) %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %gep, i32 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
-; CI-LABEL: global_atomic_dec_ret_i32:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_ret_i32:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v2
-; VI-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
-; CI-LABEL: global_atomic_dec_ret_i32_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 16
-; CI-NEXT: s_addc_u32 s3, s3, 0
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_ret_i32_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 16
-; VI-NEXT: s_addc_u32 s3, s3, 0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v2
-; VI-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) nounwind {
-; CI-LABEL: global_atomic_dec_noret_i32:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_noret_i32:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_atomic_dec v[0:1], v2
-; VI-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %ptr, i32 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %ptr) nounwind {
-; CI-LABEL: global_atomic_dec_noret_i32_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 16
-; CI-NEXT: s_addc_u32 s1, s1, 0
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_noret_i32_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_atomic_dec v[0:1], v2
-; VI-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
-; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v3
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v3
-; VI-NEXT: s_endpgm
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
- %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id
- %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr addrspace(1) %out.gep
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspace(1) %ptr) #0 {
-; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: flat_atomic_dec v[0:1], v2
-; VI-NEXT: s_endpgm
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, ptr addrspace(1) %ptr, i32 %id
- %gep = getelementptr i32, ptr addrspace(1) %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep, i32 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #0 {
-; CI-LABEL: flat_atomic_dec_ret_i32:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_ret_i32:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v2
-; VI-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr %out
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #0 {
-; CI-LABEL: flat_atomic_dec_ret_i32_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 16
-; CI-NEXT: s_addc_u32 s3, s3, 0
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_ret_i32_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 16
-; VI-NEXT: s_addc_u32 s3, s3, 0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v2
-; VI-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr %out
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) nounwind {
-; CI-LABEL: flat_atomic_dec_noret_i32:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_noret_i32:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_atomic_dec v[0:1], v2
-; VI-NEXT: s_endpgm
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %ptr, i32 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) nounwind {
-; CI-LABEL: flat_atomic_dec_noret_i32_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 16
-; CI-NEXT: s_addc_u32 s1, s1, 0
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_noret_i32_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_atomic_dec v[0:1], v2
-; VI-NEXT: s_endpgm
- %gep = getelementptr i32, ptr %ptr, i32 4
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %ptr) #0 {
-; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[0:1], v3
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[0:1], v3
-; VI-NEXT: s_endpgm
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, ptr %ptr, i32 %id
- %out.gep = getelementptr i32, ptr %out, i32 %id
- %gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
- store i32 %result, ptr %out.gep
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #0 {
-; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
-; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v2, 42
-; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
-; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, 42
-; VI-NEXT: flat_atomic_dec v[0:1], v2
-; VI-NEXT: s_endpgm
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i32, ptr %ptr, i32 %id
- %gep = getelementptr i32, ptr %gep.tid, i32 5
- %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %gep, i32 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #0 {
-; CI-LABEL: flat_atomic_dec_ret_i64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: s_add_u32 s0, s0, 4
-; CI-NEXT: s_addc_u32 s1, s1, 0
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[2:3], v0
-; CI-NEXT: flat_store_dword v[4:5], v1
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_ret_i64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_u32 s0, s0, 4
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[2:3], v0
-; VI-NEXT: flat_store_dword v[4:5], v1
-; VI-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr %out
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #0 {
-; CI-LABEL: flat_atomic_dec_ret_i64_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 32
-; CI-NEXT: s_addc_u32 s3, s3, 0
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: s_add_u32 s0, s0, 4
-; CI-NEXT: s_addc_u32 s1, s1, 0
-; CI-NEXT: v_mov_b32_e32 v5, s1
-; CI-NEXT: v_mov_b32_e32 v4, s0
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[2:3], v0
-; CI-NEXT: flat_store_dword v[4:5], v1
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_ret_i64_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 32
-; VI-NEXT: s_addc_u32 s3, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_u32 s0, s0, 4
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[2:3], v0
-; VI-NEXT: flat_store_dword v[4:5], v1
-; VI-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr %out
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) nounwind {
-; CI-LABEL: flat_atomic_dec_noret_i64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_noret_i64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %ptr, i64 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) nounwind {
-; CI-LABEL: flat_atomic_dec_noret_i64_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 32
-; CI-NEXT: s_addc_u32 s1, s1, 0
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_noret_i64_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %gep = getelementptr i64, ptr %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %ptr) #0 {
-; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: v_add_i32_e32 v4, vcc, 4, v2
-; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CI-NEXT: flat_store_dword v[2:3], v0
-; CI-NEXT: flat_store_dword v[4:5], v1
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v2
-; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; VI-NEXT: flat_store_dword v[2:3], v0
-; VI-NEXT: flat_store_dword v[4:5], v1
-; VI-NEXT: s_endpgm
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, ptr %ptr, i32 %id
- %out.gep = getelementptr i64, ptr %out, i32 %id
- %gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr %out.gep
- ret void
-}
-
-define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #0 {
-; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, ptr %ptr, i32 %id
- %gep = getelementptr i64, ptr %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %gep, i64 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-@lds0 = internal addrspace(3) global [512 x i32] undef
-
-define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 {
-; CI-LABEL: atomic_dec_shl_base_lds_0:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; CI-NEXT: v_mov_b32_e32 v2, 9
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
-; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: flat_store_dword v[0:1], v3
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: flat_store_dword v[0:1], v2
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: atomic_dec_shl_base_lds_0:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; VI-NEXT: v_mov_b32_e32 v2, 9
-; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
-; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_store_dword v[0:1], v3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: flat_store_dword v[0:1], v2
-; VI-NEXT: s_endpgm
- %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
- %idx.0 = add nsw i32 %tid.x, 2
- %arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(3) @lds0, i32 0, i32 %idx.0
- %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %arrayidx0, i32 9, i32 0, i32 0, i1 false)
- store i32 %idx.0, ptr addrspace(1) %add_use
- store i32 %val0, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
-; CI-LABEL: lds_atomic_dec_ret_i64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dword s2, s[4:5], 0x2
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: lds_atomic_dec_ret_i64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[4:5], 0x8
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(3) %ptr) #0 {
-; CI-LABEL: lds_atomic_dec_ret_i64_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dword s2, s[4:5], 0x2
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: lds_atomic_dec_ret_i64_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[4:5], 0x8
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @lds_atomic_dec_noret_i64(ptr addrspace(3) %ptr) nounwind {
-; CI-LABEL: lds_atomic_dec_noret_i64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dword s0, s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: ds_dec_u64 v2, v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: lds_atomic_dec_noret_i64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s0, s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: ds_dec_u64 v2, v[0:1]
-; VI-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %ptr, i64 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(ptr addrspace(3) %ptr) nounwind {
-; CI-LABEL: lds_atomic_dec_noret_i64_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dword s0, s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: ds_dec_u64 v2, v[0:1] offset:32
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: lds_atomic_dec_noret_i64_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s0, s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: ds_dec_u64 v2, v[0:1] offset:32
-; VI-NEXT: s_endpgm
- %gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %gep, i64 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
-; CI-LABEL: global_atomic_dec_ret_i64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_ret_i64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
-; CI-LABEL: global_atomic_dec_ret_i64_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 32
-; CI-NEXT: s_addc_u32 s3, s3, 0
-; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: v_mov_b32_e32 v3, s3
-; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_ret_i64_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 32
-; VI-NEXT: s_addc_u32 s3, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr addrspace(1) %out
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) nounwind {
-; CI-LABEL: global_atomic_dec_noret_i64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_noret_i64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %ptr, i64 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %ptr) nounwind {
-; CI-LABEL: global_atomic_dec_noret_i64_offset:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 32
-; CI-NEXT: s_addc_u32 s1, s1, 0
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_noret_i64_offset:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
-; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v4
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; CI-NEXT: v_mov_b32_e32 v3, s1
-; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v4
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
- %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id
- %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
- store i64 %result, ptr addrspace(1) %out.gep
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspace(1) %ptr) #0 {
-; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s0
-; CI-NEXT: v_mov_b32_e32 v1, s1
-; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2
-; CI-NEXT: v_mov_b32_e32 v1, 0
-; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2
-; VI-NEXT: v_mov_b32_e32 v1, 0
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
-; VI-NEXT: s_endpgm
- %id = call i32 @llvm.amdgcn.workitem.id.x()
- %gep.tid = getelementptr i64, ptr addrspace(1) %ptr, i32 %id
- %gep = getelementptr i64, ptr addrspace(1) %gep.tid, i32 5
- %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep, i64 42, i32 0, i32 0, i1 false)
- ret void
-}
-
-@lds1 = internal addrspace(3) global [512 x i64] undef, align 8
-
-define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %add_use) #0 {
-; CI-LABEL: atomic_dec_shl_base_lds_0_i64:
-; CI: ; %bb.0:
-; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CI-NEXT: v_mov_b32_e32 v1, 9
-; CI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; CI-NEXT: v_mov_b32_e32 v2, 0
-; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v4, s3
-; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, s2
-; CI-NEXT: flat_store_dword v[3:4], v0
-; CI-NEXT: v_mov_b32_e32 v4, s1
-; CI-NEXT: v_mov_b32_e32 v3, s0
-; CI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
-; CI-NEXT: s_endpgm
-;
-; VI-LABEL: atomic_dec_shl_base_lds_0_i64:
-; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v1, 9
-; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
-; VI-NEXT: v_mov_b32_e32 v2, 0
-; VI-NEXT: s_mov_b32 m0, -1
-; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: flat_store_dword v[3:4], v0
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
-; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
-; VI-NEXT: s_endpgm
- %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
- %idx.0 = add nsw i32 %tid.x, 2
- %arrayidx0 = getelementptr inbounds [512 x i64], ptr addrspace(3) @lds1, i32 0, i32 %idx.0
- %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %arrayidx0, i64 9, i32 0, i32 0, i1 false)
- store i32 %idx.0, ptr addrspace(1) %add_use
- store i64 %val0, ptr addrspace(1) %out
- ret void
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind argmemonly }
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index 2237c243e6342..13b4b4786b940 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -631,54 +631,6 @@ done:
ret void
}
-; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32(
-; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
-; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %sunkaddr, i32 2, i32 0, i32 0, i1 false)
-define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
-entry:
- %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
- %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
- %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
- %tmp0 = icmp eq i32 %tid, 0
- br i1 %tmp0, label %endif, label %if
-
-if:
- %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %in.gep, i32 2, i32 0, i32 0, i1 false)
- br label %endif
-
-endif:
- %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
- store i32 %x, ptr addrspace(3) %out.gep
- br label %done
-
-done:
- ret void
-}
-
-; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32(
-; OPT: %sunkaddr = getelementptr i8, ptr addrspace(3) %in, i32 28
-; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %sunkaddr, i32 2, i32 0, i32 0, i1 false)
-define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
-entry:
- %out.gep = getelementptr i32, ptr addrspace(3) %out, i32 999999
- %in.gep = getelementptr i32, ptr addrspace(3) %in, i32 7
- %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
- %tmp0 = icmp eq i32 %tid, 0
- br i1 %tmp0, label %endif, label %if
-
-if:
- %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %in.gep, i32 2, i32 0, i32 0, i1 false)
- br label %endif
-
-endif:
- %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
- store i32 %x, ptr addrspace(3) %out.gep
- br label %done
-
-done:
- ret void
-}
-
; OPT-LABEL: @test_sink_global_small_min_scratch_global_offset(
; OPT-SICIVI: %in.gep = getelementptr i8, ptr addrspace(1) %in, i64 -4096
; OPT-SICIV: br
@@ -790,8 +742,6 @@ done:
}
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
-declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #2
declare i32 @llvm.amdgcn.ds.append.p3(ptr addrspace(3) nocapture, i1 immarg) #3
declare i32 @llvm.amdgcn.ds.consume.p3(ptr addrspace(3) nocapture, i1 immarg) #3
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index dddf9ce597308..2ff5c3786b435 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -430,8 +430,9 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_inc v2, v0, v0, s[0:1] glc
+; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[0:1] glc
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: v_mov_b32_e32 v1, s3
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -439,7 +440,7 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %p, i32 0, i32 0, i32 0, i1 false)
+ %n32 = atomicrmw uinc_wrap ptr addrspace(1) %p, i32 1 monotonic
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -451,8 +452,9 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %
; CHECK: ; %bb.0:
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_dec v2, v0, v0, s[0:1] glc
+; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[0:1] glc
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: v_mov_b32_e32 v1, s3
; CHECK-NEXT: s_waitcnt vmcnt(0)
@@ -460,7 +462,7 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
- %n32 = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %p, i32 0, i32 0, i32 0, i1 false)
+ %n32 = atomicrmw udec_wrap ptr addrspace(1) %p, i32 1 monotonic
%n64 = zext i32 %n32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %q, i64 %n64, i32 0
store float 1.0, ptr addrspace(1) %p1
@@ -952,8 +954,6 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) inr
ret void
}
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1), i32, i32 immarg, i32 immarg, i1 immarg)
-declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1), i32, i32 immarg, i32 immarg, i1 immarg)
declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1), double)
declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1), double)
declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32, ptr addrspace(8), i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
index c1bbfa06e9d2b..c0bfd1e66466c 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
@@ -3360,9 +3360,6 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(ptr addrspace(1) in
; amdgcn atomic inc
; --------------------------------------------------------------------------------
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
-declare i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
-
define amdgpu_ps float @global_inc_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GCN-LABEL: global_inc_saddr_i32_rtn:
; GCN: ; %bb.0:
@@ -3377,7 +3374,7 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false)
+ %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data monotonic
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -3397,7 +3394,7 @@ define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false)
+ %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data monotonic
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -3415,7 +3412,7 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false)
+ %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i32 %data monotonic
ret void
}
@@ -3433,7 +3430,7 @@ define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false)
+ %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i32 %data monotonic
ret void
}
@@ -3451,7 +3448,7 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false)
+ %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data monotonic
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -3471,7 +3468,7 @@ define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(ptr addrspace(1) i
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false)
+ %rtn = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data monotonic
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -3489,7 +3486,7 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false)
+ %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep0, i64 %data monotonic
ret void
}
@@ -3507,7 +3504,7 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false)
+ %unused = atomicrmw uinc_wrap ptr addrspace(1) %gep1, i64 %data monotonic
ret void
}
@@ -3515,8 +3512,6 @@ define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
; amdgcn atomic dec
; --------------------------------------------------------------------------------
-declare i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
-declare i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
define amdgpu_ps float @global_dec_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
; GCN-LABEL: global_dec_saddr_i32_rtn:
@@ -3532,7 +3527,7 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false)
+ %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data monotonic
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -3552,7 +3547,7 @@ define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false)
+ %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data monotonic
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -3570,7 +3565,7 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep0, i32 %data, i32 0, i32 0, i1 false)
+ %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i32 %data monotonic
ret void
}
@@ -3588,7 +3583,7 @@ define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %gep1, i32 %data, i32 0, i32 0, i1 false)
+ %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i32 %data monotonic
ret void
}
@@ -3606,7 +3601,7 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(ptr addrspace(1) inreg %s
; GFX11-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false)
+ %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data monotonic
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -3626,7 +3621,7 @@ define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(ptr addrspace(1) i
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false)
+ %rtn = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data monotonic
%cast.rtn = bitcast i64 %rtn to <2 x float>
ret <2 x float> %cast.rtn
}
@@ -3644,7 +3639,7 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn(ptr addrspace(1) inreg %sbase,
; GFX11-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep0, i64 %data, i32 0, i32 0, i1 false)
+ %unused = atomicrmw udec_wrap ptr addrspace(1) %gep0, i64 %data monotonic
ret void
}
@@ -3662,7 +3657,7 @@ define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(ptr addrspace(1) inreg
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %gep1, i64 %data, i32 0, i32 0, i1 false)
+ %unused = atomicrmw udec_wrap ptr addrspace(1) %gep1, i64 %data monotonic
ret void
}
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll
index cca3885780db1..2743f1749adc0 100644
--- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll
@@ -16,121 +16,8 @@ define i64 @objectsize_global_to_flat_i64(ptr addrspace(3) %global.ptr) #0 {
ret i64 %val
}
-; CHECK-LABEL: @atomicinc_global_to_flat_i32(
-; CHECK: call i32 @llvm.amdgcn.atomic.inc.i32.p1(ptr addrspace(1) %global.ptr, i32 %y, i32 0, i32 0, i1 false)
-define i32 @atomicinc_global_to_flat_i32(ptr addrspace(1) %global.ptr, i32 %y) #0 {
- %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
- %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %cast, i32 %y, i32 0, i32 0, i1 false)
- ret i32 %ret
-}
-
-; CHECK-LABEL: @atomicinc_group_to_flat_i32(
-; CHECK: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %group.ptr, i32 %y, i32 0, i32 0, i1 false)
-define i32 @atomicinc_group_to_flat_i32(ptr addrspace(3) %group.ptr, i32 %y) #0 {
- %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
- %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr %cast, i32 %y, i32 0, i32 0, i1 false)
- ret i32 %ret
-}
-
-; CHECK-LABEL: @atomicinc_global_to_flat_i64(
-; CHECK: call i64 @llvm.amdgcn.atomic.inc.i64.p1(ptr addrspace(1) %global.ptr, i64 %y, i32 0, i32 0, i1 false)
-define i64 @atomicinc_global_to_flat_i64(ptr addrspace(1) %global.ptr, i64 %y) #0 {
- %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
- %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 false)
- ret i64 %ret
-}
-
-; CHECK-LABEL: @atomicinc_group_to_flat_i64(
-; CHECK: call i64 @llvm.amdgcn.atomic.inc.i64.p3(ptr addrspace(3) %group.ptr, i64 %y, i32 0, i32 0, i1 false)
-define i64 @atomicinc_group_to_flat_i64(ptr addrspace(3) %group.ptr, i64 %y) #0 {
- %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
- %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 false)
- ret i64 %ret
-}
-
-; CHECK-LABEL: @atomicdec_global_to_flat_i32(
-; CHECK: call i32 @llvm.amdgcn.atomic.dec.i32.p1(ptr addrspace(1) %global.ptr, i32 %val, i32 0, i32 0, i1 false)
-define i32 @atomicdec_global_to_flat_i32(ptr addrspace(1) %global.ptr, i32 %val) #0 {
- %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
- %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 false)
- ret i32 %ret
-}
-
-; CHECK-LABEL: @atomicdec_group_to_flat_i32(
-; CHECK: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %group.ptr, i32 %val, i32 0, i32 0, i1 false)
-define i32 @atomicdec_group_to_flat_i32(ptr addrspace(3) %group.ptr, i32 %val) #0 {
- %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
- %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 false)
- ret i32 %ret
-}
-
-; CHECK-LABEL: @atomicdec_global_to_flat_i64(
-; CHECK: call i64 @llvm.amdgcn.atomic.dec.i64.p1(ptr addrspace(1) %global.ptr, i64 %y, i32 0, i32 0, i1 false)
-define i64 @atomicdec_global_to_flat_i64(ptr addrspace(1) %global.ptr, i64 %y) #0 {
- %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
- %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 false)
- ret i64 %ret
-}
-
-; CHECK-LABEL: @atomicdec_group_to_flat_i64(
-; CHECK: call i64 @llvm.amdgcn.atomic.dec.i64.p3(ptr addrspace(3) %group.ptr, i64 %y, i32 0, i32 0, i1 false
-define i64 @atomicdec_group_to_flat_i64(ptr addrspace(3) %group.ptr, i64 %y) #0 {
- %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
- %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 false)
- ret i64 %ret
-}
-
-; CHECK-LABEL: @volatile_atomicinc_group_to_flat_i64(
-; CHECK-NEXT: %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
-; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true)
-define i64 @volatile_atomicinc_group_to_flat_i64(ptr addrspace(3) %group.ptr, i64 %y) #0 {
- %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
- %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true)
- ret i64 %ret
-}
-
-; CHECK-LABEL: @volatile_atomicdec_global_to_flat_i32(
-; CHECK-NEXT: %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
-; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 true)
-define i32 @volatile_atomicdec_global_to_flat_i32(ptr addrspace(1) %global.ptr, i32 %val) #0 {
- %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
- %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 true)
- ret i32 %ret
-}
-
-; CHECK-LABEL: @volatile_atomicdec_group_to_flat_i32(
-; CHECK-NEXT: %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
-; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 true)
-define i32 @volatile_atomicdec_group_to_flat_i32(ptr addrspace(3) %group.ptr, i32 %val) #0 {
- %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
- %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr %cast, i32 %val, i32 0, i32 0, i1 true)
- ret i32 %ret
-}
-
-; CHECK-LABEL: @volatile_atomicdec_global_to_flat_i64(
-; CHECK-NEXT: %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
-; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true)
-define i64 @volatile_atomicdec_global_to_flat_i64(ptr addrspace(1) %global.ptr, i64 %y) #0 {
- %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
- %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true)
- ret i64 %ret
-}
-
-; CHECK-LABEL: @volatile_atomicdec_group_to_flat_i64(
-; CHECK-NEXT: %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
-; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true)
-define i64 @volatile_atomicdec_group_to_flat_i64(ptr addrspace(3) %group.ptr, i64 %y) #0 {
- %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
- %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr %cast, i64 %y, i32 0, i32 0, i1 true)
- ret i64 %ret
-}
-
declare i32 @llvm.objectsize.i32.p0(ptr, i1, i1, i1) #1
declare i64 @llvm.objectsize.i64.p0(ptr, i1, i1, i1) #1
-declare i32 @llvm.amdgcn.atomic.inc.i32.p0(ptr nocapture, i32, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p0(ptr nocapture, i64, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.atomic.dec.i32.p0(ptr nocapture, i32, i32, i32, i1) #2
-declare i64 @llvm.amdgcn.atomic.dec.i64.p0(ptr nocapture, i64, i32, i32, i1) #2
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
index f036a0fc2c76a..798b291691055 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
@@ -87,87 +87,5 @@ bb:
br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
}
-; OPT-LABEL: @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(
-
-; OPT: .lr.ph.preheader:
-; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65532
-; OPT: br label %.lr.ph
-; OPT: .lr.ph:
-; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
-; OPT: %lsr.iv1 = phi ptr addrspace(3) [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
-; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
-; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %lsr.iv3, i32 undef, i32 0, i32 0, i1 false)
-; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
-; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 4
-define amdgpu_kernel void @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(3) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
-bb:
- %tmp = icmp sgt i32 %n, 0
- br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
-
-.lr.ph.preheader: ; preds = %bb
- br label %.lr.ph
-
-._crit_edge.loopexit: ; preds = %.lr.ph
- br label %._crit_edge
-
-._crit_edge: ; preds = %._crit_edge.loopexit, %bb
- ret void
-
-.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
- %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
- %tmp1 = add nuw nsw i32 %indvars.iv, 16383
- %tmp3 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 %tmp1
- %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %tmp3, i32 undef, i32 0, i32 0, i1 false)
- %tmp6 = getelementptr inbounds i32, ptr addrspace(3) %arg0, i32 %indvars.iv
- %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %tmp6, i32 undef, i32 0, i32 0, i1 false)
- %tmp8 = add nsw i32 %tmp7, %tmp4
- atomicrmw add ptr addrspace(3) %tmp6, i32 %tmp8 seq_cst
- %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
- %exitcond = icmp eq i32 %indvars.iv.next, %n
- br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
-}
-
-; OPT-LABEL: @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(
-; OPT: .lr.ph.preheader:
-; OPT: %scevgep2 = getelementptr i8, ptr addrspace(3) %arg1, i32 65532
-; OPT: br label %.lr.ph
-; OPT: .lr.ph:
-; OPT: %lsr.iv3 = phi ptr addrspace(3) [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
-; OPT: %lsr.iv1 = phi ptr addrspace(3) [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
-; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
-; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %lsr.iv3, i32 undef, i32 0, i32 0, i1 false)
-; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
-; OPT: %scevgep4 = getelementptr i8, ptr addrspace(3) %lsr.iv3, i32 4
-define amdgpu_kernel void @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(ptr addrspace(3) noalias nocapture %arg0, ptr addrspace(3) noalias nocapture readonly %arg1, i32 %n) #0 {
-bb:
- %tmp = icmp sgt i32 %n, 0
- br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
-
-.lr.ph.preheader: ; preds = %bb
- br label %.lr.ph
-
-._crit_edge.loopexit: ; preds = %.lr.ph
- br label %._crit_edge
-
-._crit_edge: ; preds = %._crit_edge.loopexit, %bb
- ret void
-
-.lr.ph: ; preds = %.lr.ph, %.lr.ph.preheader
- %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
- %tmp1 = add nuw nsw i32 %indvars.iv, 16383
- %tmp3 = getelementptr inbounds i32, ptr addrspace(3) %arg1, i32 %tmp1
- %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %tmp3, i32 undef, i32 0, i32 0, i1 false)
- %tmp6 = getelementptr inbounds i32, ptr addrspace(3) %arg0, i32 %indvars.iv
- %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %tmp6, i32 undef, i32 0, i32 0, i1 false)
- %tmp8 = add nsw i32 %tmp7, %tmp4
- atomicrmw add ptr addrspace(3) %tmp6, i32 %tmp8 seq_cst
- %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
- %exitcond = icmp eq i32 %indvars.iv.next, %n
- br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
-}
-
-declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #1
-declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1) #1
-
attributes #0 = { nounwind }
attributes #1 = { nounwind argmemonly }
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
index 1cae1dfeb5e62..bb370a6d1dfeb 100644
--- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
@@ -142,44 +142,6 @@ define i64 @invalid_nonconstant_fcmp_code(float %a, float %b, i32 %c) {
ret i64 %result
}
-declare i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1)
-define amdgpu_kernel void @invalid_atomic_inc(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %var, i1 %bool) {
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i32 %var
- ; CHECK-NEXT: %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 %var, i32 0, i1 false)
- %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 %var, i32 0, i1 false)
-
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i32 %var
- ; CHECK-NEXT: %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 %var, i1 false)
- %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 %var, i1 false)
-
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i1 %bool
- ; CHECK-NEXT: %result2 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 %bool)
- %result2 = call i32 @llvm.amdgcn.atomic.inc.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 %bool)
- ret void
-}
-
-declare i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) nocapture, i32, i32, i32, i1)
-define amdgpu_kernel void @invalid_atomic_dec(ptr addrspace(1) %out, ptr addrspace(3) %ptr, i32 %var, i1 %bool) {
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i32 %var
- ; CHECK-NEXT: %result0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 %var, i32 0, i1 false)
- %result0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 %var, i32 0, i1 false)
-
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i32 %var
- ; CHECK-NEXT: %result1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 %var, i1 false)
- %result1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 %var, i1 false)
-
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i1 %bool
- ; CHECK-NEXT: %result2 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 %bool)
- %result2 = call i32 @llvm.amdgcn.atomic.dec.i32.p3(ptr addrspace(3) %ptr, i32 42, i32 0, i32 0, i1 %bool)
- ret void
-}
-
declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1)
define amdgpu_kernel void @test_div_scale_f32_val_undef_undef(ptr addrspace(1) %out) {
; CHECK: immarg operand has non-immediate parameter
More information about the llvm-commits mailing list