[llvm] [NVPTX] Add syncscope support for cmpxchg (PR #140807)
Akshay Deodhar via llvm-commits
llvm-commits at lists.llvm.org
Tue May 20 14:32:54 PDT 2025
https://github.com/akshayrdeodhar created https://github.com/llvm/llvm-project/pull/140807
This PR adds support for cmpxchg instructions with syncscope.
Adds PatFrags for matching syncscope for 3-input atomic operations in the NVPTX backend.
Handles syncscope correctly for emulation loops in AtomicExpand, in bracketInstWithFences.
Modifies emitLeadingFence and emitTrailingFence to accept a SyncScope::ID parameter, and updates their implementations in the other backends that override them.
Adds tests for all possible combinations of the cmpxchg instruction (with modifications to cmpxchg.py).
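
For illustration, a minimal sketch of how the new hook signature is meant to be used, modeled on the TargetLoweringBase default in this patch (the target class name is hypothetical and the usual LLVM includes are omitted):

// Sketch only: a target override now also receives the SyncScope::ID of the
// atomic operation being bracketed and can forward it to the fence it emits,
// instead of always creating a system-scoped fence.
Instruction *MyTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
                                                Instruction *Inst,
                                                AtomicOrdering Ord,
                                                SyncScope::ID SSID) const {
  if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
    return Builder.CreateFence(Ord, SSID); // fence inherits the atomic's scope
  return nullptr;
}

AtomicExpand's bracketInstWithFences passes the cmpxchg's scope through to these hooks, which is how the NVPTX tests below end up checking scoped fence/membar instructions.
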
>From 38db2d3d9066d736b5f4a43e59d0c4ab8c68dab0 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Fri, 9 May 2025 01:00:43 +0000
Subject: [PATCH] [NVPTX] Add syncscope support for cmpxchg
---
llvm/include/llvm/CodeGen/TargetLowering.h | 16 +-
llvm/lib/CodeGen/AtomicExpandPass.cpp | 18 +-
llvm/lib/CodeGen/TargetLoweringBase.cpp | 10 +-
llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +-
llvm/lib/Target/ARM/ARMISelLowering.h | 10 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 13 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 12 +-
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 67 +-
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 6 +-
llvm/lib/Target/PowerPC/PPCISelLowering.h | 12 +-
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +-
llvm/lib/Target/RISCV/RISCVISelLowering.h | 12 +-
llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 8 +-
llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 16140 +++++++++++--
llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 16238 +++++++++++--
llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 21932 ++++++++++++++++--
llvm/test/CodeGen/NVPTX/cmpxchg.ll | 40 +-
llvm/test/CodeGen/NVPTX/cmpxchg.py | 13 +-
18 files changed, 47182 insertions(+), 7377 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index ac9ab7f7fd210..265f1fd724237 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2286,13 +2286,15 @@ class TargetLoweringBase {
/// standard ABI uses a fence before a seq_cst load instead of after a
/// seq_cst store).
/// @{
- virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
-
- virtual Instruction *emitTrailingFence(IRBuilderBase &Builder,
- Instruction *Inst,
- AtomicOrdering Ord) const;
+ virtual Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
+
+ virtual Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const;
/// @}
// Emits code that executes when the comparison result in the ll/sc
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index a3e9700fa3089..1b9e0056eae74 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -65,7 +65,8 @@ class AtomicExpandImpl {
const DataLayout *DL = nullptr;
private:
- bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+ SyncScope::ID SSID = SyncScope::System);
IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
bool tryExpandAtomicLoad(LoadInst *LI);
@@ -303,6 +304,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
if (TLI->shouldInsertFencesForAtomic(I)) {
auto FenceOrdering = AtomicOrdering::Monotonic;
+ SyncScope::ID SSID = SyncScope::System;
if (LI && isAcquireOrStronger(LI->getOrdering())) {
FenceOrdering = LI->getOrdering();
LI->setOrdering(AtomicOrdering::Monotonic);
@@ -325,13 +327,18 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+ SSID = CASI->getSyncScopeID();
CASI->setSuccessOrdering(CASOrdering);
CASI->setFailureOrdering(CASOrdering);
+      // If the CAS ordering is monotonic, the operation takes the default
+      // scope. Otherwise, it retains its scope.
+ if (CASOrdering != AtomicOrdering::Monotonic)
+ CASI->setSyncScopeID(SSID);
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
- MadeChange |= bracketInstWithFences(I, FenceOrdering);
+ MadeChange |= bracketInstWithFences(I, FenceOrdering, SSID);
}
} else if (I->hasAtomicStore() &&
TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
@@ -432,12 +439,13 @@ PreservedAnalyses AtomicExpandPass::run(Function &F,
}
bool AtomicExpandImpl::bracketInstWithFences(Instruction *I,
- AtomicOrdering Order) {
+ AtomicOrdering Order,
+ SyncScope::ID SSID) {
ReplacementIRBuilder Builder(I, *DL);
- auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order);
+ auto LeadingFence = TLI->emitLeadingFence(Builder, I, Order, SSID);
- auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order);
+ auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order, SSID);
// We have a guard here because not every atomic operation generates a
// trailing fence.
if (TrailingFence)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index f5ea3c0b47d6a..61d8b1de30ff7 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2320,18 +2320,20 @@ TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
Instruction *TargetLoweringBase::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore())
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
Instruction *TargetLoweringBase::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (isAcquireOrStronger(Ord))
- return Builder.CreateFence(Ord);
+ return Builder.CreateFence(Ord, SSID);
else
return nullptr;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index e3dc337bd0843..9bd5166c19c24 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21221,7 +21221,8 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
@@ -21246,7 +21247,8 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *ARMTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
switch (Ord) {
case AtomicOrdering::NotAtomic:
case AtomicOrdering::Unordered:
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..da09eca2b946f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -666,10 +666,12 @@ class VectorType;
void
emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *emitLeadingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+      SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *emitTrailingFence(
+ IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord,
+      SyncScope::ID SSID = SyncScope::System) const override;
unsigned getMaxSupportedInterleaveFactor() const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3e755c25fd91a..946c44ac82abb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6055,7 +6055,8 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
@@ -6063,15 +6064,17 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
if (isReleaseOrStronger(Ord))
return Ord == AtomicOrdering::SequentiallyConsistent
- ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
- : Builder.CreateFence(AtomicOrdering::Release);
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+ SSID)
+ : Builder.CreateFence(AtomicOrdering::Release, SSID);
return nullptr;
}
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
// Specialize for cmpxchg
if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
@@ -6084,7 +6087,7 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
if (isAcquireOrStronger(Ord) &&
(Ord != AtomicOrdering::SequentiallyConsistent ||
CASWidth < STI.getMinCmpXchgSizeInBits()))
- return Builder.CreateFence(AtomicOrdering::Acquire);
+ return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index f41c569a65544..07304adf21ac2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -266,10 +266,14 @@ class NVPTXTargetLowering : public TargetLowering {
AtomicOrdering
atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
private:
const NVPTXSubtarget &STI; // cache the subtarget here
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 7d7e69adafcd0..e02c335bc8d13 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -38,6 +38,27 @@ def AS_match {
}];
}
+multiclass nvvm_ternary_atomic_op_scoped<SDPatternOperator frag> {
+ defvar frag_pat = (frag node:$ptr, node:$cmp, node:$val);
+ def NAME#_cta: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Block;
+ }]>;
+  def NAME#_cluster: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Cluster;
+ }]>;
+ def NAME#_gpu: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::Device;
+ }]>;
+ def NAME#_sys: PatFrag<!setdagop(frag_pat, ops),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val), [{
+ return Scopes[cast<MemSDNode>(N)->getSyncScopeID()] == NVPTX::Scope::System;
+ }]>;
+}
+
+
// A node that will be replaced with the current PTX version.
class PTX {
SDNodeXForm PTXVerXform = SDNodeXForm<imm, [{
@@ -2022,40 +2043,41 @@ multiclass F_ATOMIC_2_NEG<ValueType regT, NVPTXRegClass regclass, string SpaceSt
// has 3 operands
multiclass F_ATOMIC_3_imp<ValueType ptrT, NVPTXRegClass ptrclass,
- ValueType regT, NVPTXRegClass regclass, string SemStr,
- string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
- Operand IMMType, list<Predicate> Pred> {
+ ValueType regT, NVPTXRegClass regclass, string SemStr,
+ string ScopeStr, string SpaceStr, string TypeStr, string OpcStr,
+ PatFrag IntOp, Operand IMMType, list<Predicate> Pred> {
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def reg : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, regclass:$c),
- !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ !strconcat("atom", SemStr, ScopeStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), (regT regclass:$c)))]>,
Requires<Pred>;
def imm1 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, regclass:$c),
- !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ !strconcat("atom", SemStr, ScopeStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, (regT regclass:$c)))]>,
Requires<Pred>;
def imm2 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, IMMType:$c),
- !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
+ !strconcat("atom", SemStr, ScopeStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), imm:$c))]>,
Requires<Pred>;
def imm3 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, IMMType:$c),
- !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ !strconcat("atom", SemStr, ScopeStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, imm:$c))]>,
Requires<Pred>;
}
}
-multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SemStr, string SpaceStr,
- string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
- defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
+multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SemStr, string ScopeStr,
+ string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType,
+ list<Predicate> Pred = []> {
+ defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SemStr, ScopeStr, SpaceStr, TypeStr,
OpcStr, IntOp, IMMType, Pred>;
- defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
+ defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SemStr, ScopeStr, SpaceStr, TypeStr,
OpcStr, IntOp, IMMType, Pred>;
}
@@ -2469,10 +2491,12 @@ foreach size = ["i16", "i32", "i64"] in {
// ".cas", atomic_cmp_swap_i32_acquire_global, i32imm,
// [hasSM<70>, hasPTX<63>]>
multiclass INT_PTX_ATOM_CAS<string atomic_cmp_swap_pat, string type,
- string order, string addrspace, list<Predicate> preds>
+ string order, string scope, string addrspace,
+ list<Predicate> preds>
: F_ATOMIC_3<!cast<ValueType>("i"#type),
!cast<NVPTXRegClass>("Int"#type#"Regs"),
order,
+ scope,
addrspace,
".b"#type,
".cas",
@@ -2487,26 +2511,35 @@ foreach size = ["32", "64"] in {
defvar cas_addrspace_string = !if(!eq(addrspace, "generic"), "", "."#addrspace);
foreach order = ["acquire", "release", "acq_rel", "monotonic"] in {
defvar cas_order_string = !if(!eq(order, "monotonic"), ".relaxed", "."#order);
+ defvar atomic_cmp_swap_pat = !cast<PatFrag>("atomic_cmp_swap_i"#size#_#order#_#addrspace);
+ defm atomic_cmp_swap_i#size#_#order#_#addrspace: nvvm_ternary_atomic_op_scoped<atomic_cmp_swap_pat>;
+
+ foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+ defm INT_PTX_ATOM_CAS_#size#_#order#addrspace#scope
+ : INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace#_#scope, size,
+ cas_order_string, "."#scope, cas_addrspace_string,
+ [hasSM<70>, hasPTX<63>]>;
+ }
// Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
// Memory orders are only supported for SM70+, PTX63+- so we have two sets of instruction definitions-
// for SM70+, and "old" ones which lower to "atom.cas", for earlier archs.
defm INT_PTX_ATOM_CAS_#size#_#order#addrspace
: INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size,
- cas_order_string, cas_addrspace_string,
+ cas_order_string, "", cas_addrspace_string,
[hasSM<70>, hasPTX<63>]>;
defm INT_PTX_ATOM_CAS_#size#_#order#_old#addrspace
: INT_PTX_ATOM_CAS<"atomic_cmp_swap_i"#size#_#order#_#addrspace, size,
- "", cas_addrspace_string, []>;
+ "", "", cas_addrspace_string, []>;
}
}
}
// Note that 16-bit CAS support in PTX is emulated.
-defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, "", ".global", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, "", "", ".global", ".b16", ".cas",
atomic_cmp_swap_i16_global, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, "", ".shared", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, "", "", ".shared", ".b16", ".cas",
atomic_cmp_swap_i16_shared, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", "", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", "", "", ".b16", ".cas",
atomic_cmp_swap_i16_generic, i16imm, [hasSM<70>, hasPTX<63>]>;
// Support for scoped atomic operations. Matches
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 91df5f467e59c..53a3fcc1008b7 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12430,7 +12430,8 @@ static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
// http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Ord == AtomicOrdering::SequentiallyConsistent)
return callIntrinsic(Builder, Intrinsic::ppc_sync);
if (isReleaseOrStronger(Ord))
@@ -12440,7 +12441,8 @@ Instruction *PPCTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) {
// See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and
// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 1f22aa16a89be..8e02c0dbc0fca 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -921,10 +921,14 @@ namespace llvm {
return true;
}
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
bool shouldInlineQuadwordAtomics() const;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4e6b3a224b79b..2c4000e837f09 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -22094,7 +22094,8 @@ void RISCVTargetLowering::LowerAsmOperandForConstraint(
Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Subtarget.hasStdExtZtso()) {
if (isa<LoadInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
return Builder.CreateFence(Ord);
@@ -22110,7 +22111,8 @@ Instruction *RISCVTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *RISCVTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
- AtomicOrdering Ord) const {
+ AtomicOrdering Ord,
+ SyncScope::ID SSID) const {
if (Subtarget.hasStdExtZtso()) {
if (isa<StoreInst>(Inst) && Ord == AtomicOrdering::SequentiallyConsistent)
return Builder.CreateFence(Ord);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 26b888653c81d..0f3b50779b30b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -708,10 +708,14 @@ class RISCVTargetLowering : public TargetLowering {
// than this hook due to limitations in the interface here.
bool shouldInsertFencesForAtomic(const Instruction *I) const override;
- Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
- Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
- AtomicOrdering Ord) const override;
+ Instruction *
+ emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
+ Instruction *
+ emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord,
+ SyncScope::ID SSID = SyncScope::System) const override;
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index 9027bd6a14780..98afc792b3b0b 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
-; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32;
+; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r6, [%r1], %r54, %r32;
; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
@@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37;
+; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r9, [%r1], %r55, %r37;
; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
@@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45;
+; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r15, [%r10], %r56, %r45;
; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
@@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
-; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53;
+; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r21, [%r16], %r57, %r53;
; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
index ea308c2a7673b..3c00f9585254f 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %}
-define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_monotonic_i8_generic(
+define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_generic_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_monotonic_i8_global(
+define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_generic_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB1_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_monotonic_i8_shared(
+define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB2_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_acquire_i8_generic(
+define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_global_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -158,15 +158,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB3_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB3_1;
; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_acquire_i8_global(
+define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_global_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -204,7 +203,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB4_1;
; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_acquire_i8_shared(
+define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_global_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB5_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB5_1;
; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_seq_cst_i8_generic(
+define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_shared_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB6_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB6_1;
; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_seq_cst_i8_global(
+define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_shared_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -344,15 +338,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB7_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB7_1;
; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: monotonic_seq_cst_i8_shared(
+define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -391,7 +383,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB8_1;
; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
ret i8 %new
}
-define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_monotonic_i8_generic(
+define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_generic_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -437,7 +428,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -458,12 +449,12 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
ret i8 %new
}
-define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_monotonic_i8_global(
+define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_generic_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -471,8 +462,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -483,15 +474,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB10_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -501,15 +492,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB10_1;
; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
ret i8 %new
}
-define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_monotonic_i8_shared(
+define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -517,8 +508,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -529,15 +520,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB11_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -547,15 +538,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB11_1;
; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
ret i8 %new
}
-define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_acquire_i8_generic(
+define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_global_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -563,8 +554,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -575,15 +566,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB12_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -596,12 +587,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
ret i8 %new
}
-define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_acquire_i8_global(
+define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_global_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -609,8 +600,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -621,7 +612,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -639,15 +630,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB13_1;
; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
ret i8 %new
}
-define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_acquire_i8_shared(
+define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_global_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -655,8 +646,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -667,15 +658,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB14_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -685,15 +676,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB14_1;
; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
ret i8 %new
}
-define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_seq_cst_i8_generic(
+define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_shared_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -701,9 +692,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -714,15 +704,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB15_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -735,12 +725,12 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
ret i8 %new
}
-define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_seq_cst_i8_global(
+define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_shared_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -748,9 +738,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -761,15 +750,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB16_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -779,15 +768,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB16_1;
; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
ret i8 %new
}
-define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acquire_seq_cst_i8_shared(
+define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -795,9 +784,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -808,7 +796,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -826,15 +814,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB17_1;
; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
ret i8 %new
}
-define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_monotonic_i8_generic(
+define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_generic_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -842,8 +830,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0];
; SM60-NEXT: membar.sys;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
@@ -855,7 +843,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -873,14 +861,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB18_1;
; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
ret i8 %new
}
-define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_monotonic_i8_global(
+define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_generic_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -888,9 +877,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -901,15 +890,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB19_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -919,14 +908,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB19_1;
; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
ret i8 %new
}
-define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_monotonic_i8_shared(
+define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -934,9 +924,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -947,15 +937,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB20_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -965,14 +955,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB20_1;
; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
ret i8 %new
}
-define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_acquire_i8_generic(
+define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_global_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -980,8 +971,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0];
; SM60-NEXT: membar.sys;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
@@ -993,15 +984,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB21_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1014,12 +1005,12 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
ret i8 %new
}
-define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_acquire_i8_global(
+define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_global_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1027,9 +1018,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1040,7 +1031,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -1058,15 +1049,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB22_1;
; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
ret i8 %new
}
-define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_acquire_i8_shared(
+define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_global_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1074,9 +1065,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1087,15 +1078,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB23_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1105,15 +1096,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB23_1;
; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
ret i8 %new
}
-define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_seq_cst_i8_generic(
+define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_shared_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1121,8 +1112,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0];
; SM60-NEXT: membar.sys;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
@@ -1134,15 +1125,15 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB24_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1155,12 +1146,12 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
ret i8 %new
}
-define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_seq_cst_i8_global(
+define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_shared_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1168,9 +1159,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1181,15 +1172,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB25_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1199,15 +1190,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB25_1;
; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
ret i8 %new
}
-define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: release_seq_cst_i8_shared(
+define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1215,9 +1206,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1228,7 +1219,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -1246,15 +1237,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB26_1;
; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
ret i8 %new
}
-define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_monotonic_i8_generic(
+define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_generic_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1262,9 +1253,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1275,7 +1265,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -1296,12 +1286,12 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_monotonic_i8_global(
+define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_generic_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1309,9 +1299,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1322,15 +1311,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB28_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1340,15 +1329,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB28_1;
; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_monotonic_i8_shared(
+define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1356,9 +1345,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1369,15 +1357,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB29_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1387,15 +1375,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB29_1;
; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_acquire_i8_generic(
+define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_global_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1403,9 +1391,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1416,15 +1403,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB30_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1437,12 +1424,12 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_acquire_i8_global(
+define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_global_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1450,9 +1437,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1463,7 +1449,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -1481,15 +1467,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB31_1;
; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_acquire_i8_shared(
+define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_global_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1497,9 +1483,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1510,15 +1495,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB32_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1528,15 +1513,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB32_1;
; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i8_generic(
+define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_shared_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1544,9 +1529,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1557,15 +1541,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB33_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1578,12 +1562,12 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i8_global(
+define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_shared_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1591,9 +1575,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1604,15 +1587,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB34_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1622,15 +1605,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB34_1;
; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i8_shared(
+define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1638,9 +1621,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1651,7 +1633,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB35_1;
; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
ret i8 %new
}
-define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_monotonic_i8_generic(
+define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_generic_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1685,9 +1667,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1698,7 +1679,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -1719,12 +1700,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_monotonic_i8_global(
+define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_generic_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1732,9 +1713,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1745,15 +1725,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB37_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB37_1;
; SM60-NEXT: $L__BB37_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_monotonic_i8_shared(
+define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1779,9 +1759,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1792,15 +1771,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: ld.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB38_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB38_1;
; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_acquire_i8_generic(
+define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_global_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1826,9 +1805,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1839,15 +1817,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB39_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1860,12 +1838,12 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_acquire_i8_global(
+define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_global_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1873,9 +1851,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1886,7 +1863,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -1904,15 +1881,15 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB40_1;
; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_acquire_i8_shared(
+define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_global_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1920,9 +1897,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1933,15 +1909,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB41_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB41_1;
; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i8_generic(
+define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_shared_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -1967,9 +1943,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -1980,15 +1955,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB42_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -2001,12 +1976,12 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: membar.sys;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i8_global(
+define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_shared_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -2014,9 +1989,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -2027,15 +2001,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
-; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
; SM60-NEXT: or.b32 %r17, %r20, %r3;
; SM60-NEXT: or.b32 %r18, %r20, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB43_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB43_1;
; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.cta;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i8_shared(
+define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
@@ -2061,9 +2035,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
; SM60-NEXT: and.b32 %r10, %r9, 3;
@@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1];
; SM60-NEXT: shl.b32 %r4, %r15, %r1;
; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
; SM60-NEXT: and.b32 %r20, %r16, %r2;
@@ -2092,3589 +2065,14968 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB44_1;
; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
+; SM60-NEXT: membar.gl;
; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
ret i8 %new
}
-define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_monotonic_i16_generic(
+define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_generic_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB45_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB45_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB45_1;
; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_monotonic_i16_global(
+define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_generic_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB46_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB46_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB46_1;
; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_monotonic_i16_shared(
+define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB47_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB47_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB47_1;
; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_acquire_i16_generic(
+define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_global_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0];
+; SM60-NEXT: membar.sys;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB48_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB48_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB48_1;
; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end
; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_acquire_i16_global(
+define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_global_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB49_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB49_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB49_1;
; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_acquire_i16_shared(
+define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_global_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB50_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB50_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB50_1;
; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_seq_cst_i16_generic(
+define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_shared_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB51_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB51_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB51_1;
; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end
; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_seq_cst_i16_global(
+define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_shared_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB52_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB52_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB52_1;
; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: monotonic_seq_cst_i16_shared(
+define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB53_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB53_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB53_1;
; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+ ret i8 %new
}
-define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_monotonic_i16_generic(
+define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_generic_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB54_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB54_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB54_1;
; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+ ret i8 %new
}
-define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_monotonic_i16_global(
+define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_generic_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB55_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB55_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB55_1;
; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+ ret i8 %new
}
-define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_monotonic_i16_shared(
+define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB56_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB56_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB56_1;
; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+ ret i8 %new
}
-define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_acquire_i16_generic(
+define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_global_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0];
+; SM60-NEXT: membar.sys;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB57_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB57_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB57_1;
; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+ ret i8 %new
}
-define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_acquire_i16_global(
+define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_global_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB58_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB58_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB58_1;
; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+ ret i8 %new
}
-define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_acquire_i16_shared(
+define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_global_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB59_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB59_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB59_1;
; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+ ret i8 %new
}
-define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_seq_cst_i16_generic(
+define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_shared_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB60_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB60_1;
; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+ ret i8 %new
}
-define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_seq_cst_i16_global(
+define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_shared_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB61_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB61_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB61_1;
; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+ ret i8 %new
}
-define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acquire_seq_cst_i16_shared(
+define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB62_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB62_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB62_1;
; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+ ret i8 %new
}
-define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_monotonic_i16_generic(
+define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_generic_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB63_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB63_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB63_1;
; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire
+ ret i8 %new
}
-define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_monotonic_i16_global(
+define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_generic_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB64_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB64_1;
; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+ ret i8 %new
}
-define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_monotonic_i16_shared(
+define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB65_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB65_1;
; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+ ret i8 %new
}
-define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_acquire_i16_generic(
+define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_global_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB66_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB66_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB66_1;
; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end
; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+ ret i8 %new
}
-define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_acquire_i16_global(
+define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_global_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB67_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB67_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB67_1;
; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+ ret i8 %new
}
-define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_acquire_i16_shared(
+define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_global_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB68_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB68_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB68_1;
; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+ ret i8 %new
}
-define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_seq_cst_i16_generic(
+define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_shared_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB69_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB69_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB69_1;
; SM60-NEXT: $L__BB69_3: // %partword.cmpxchg.end
; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+ ret i8 %new
}
-define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_seq_cst_i16_global(
+define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_shared_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB70_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB70_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB70_1;
; SM60-NEXT: $L__BB70_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+ ret i8 %new
}
-define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: release_seq_cst_i16_shared(
+define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB71_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB71_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB71_1;
; SM60-NEXT: $L__BB71_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+ ret i8 %new
}
-define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_monotonic_i16_generic(
+define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_generic_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB72_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB72_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB72_1;
; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end
; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_monotonic_i16_global(
+define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_generic_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB73_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB73_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB73_1;
; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_monotonic_i16_shared(
+define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB74_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB74_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB74_1;
; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_acquire_i16_generic(
+define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_global_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB75_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB75_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB75_1;
; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end
; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_acquire_i16_global(
+define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_global_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB76_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB76_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB76_1;
; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_acquire_i16_shared(
+define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_global_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB77_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB77_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB77_1;
; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i16_generic(
+define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_shared_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB78_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB78_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB78_1;
; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end
; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i16_global(
+define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_shared_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB79_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB79_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB79_1;
; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i16_shared(
+define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB80_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB80_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB80_1;
; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+ ret i8 %new
}
-define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_monotonic_i16_generic(
+define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_generic_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB81_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB81_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB81_1;
; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end
; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_monotonic_i16_global(
+define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_generic_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB82_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB82_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB82_1;
; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_monotonic_i16_shared(
+define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB83_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB83_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB83_1;
; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_acquire_i16_generic(
+define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_global_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB84_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB84_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB84_1;
; SM60-NEXT: $L__BB84_3: // %partword.cmpxchg.end
; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_acquire_i16_global(
+define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_global_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB85_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB85_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB85_1;
; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_acquire_i16_shared(
+define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_global_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB86_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB86_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB86_1;
; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i16_generic(
+define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_shared_sys(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB87_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB87_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB87_1;
; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end
; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i16_global(
+define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_shared_cta(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.global.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB88_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB88_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB88_1;
; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i16_shared(
+define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .pred %p<3>;
; SM60-NEXT: .reg .b16 %rs<2>;
-; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b32 %r<21>;
; SM60-NEXT: .reg .b64 %rd<3>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
; SM60-NEXT: and.b64 %rd1, %rd2, -4;
-; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM60-NEXT: and.b32 %r11, %r10, 3;
-; SM60-NEXT: shl.b32 %r1, %r11, 3;
-; SM60-NEXT: mov.b32 %r12, 65535;
-; SM60-NEXT: shl.b32 %r13, %r12, %r1;
-; SM60-NEXT: not.b32 %r2, %r13;
-; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
; SM60-NEXT: shl.b32 %r3, %r14, %r1;
-; SM60-NEXT: shl.b32 %r4, %r9, %r1;
-; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop
; SM60-NEXT: // =>This Inner Loop Header: Depth=1
-; SM60-NEXT: or.b32 %r16, %r19, %r3;
-; SM60-NEXT: or.b32 %r17, %r19, %r4;
-; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM60-NEXT: @%p1 bra $L__BB89_3;
; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM60-NEXT: // in Loop: Header=BB89_1 Depth=1
; SM60-NEXT: and.b32 %r8, %r7, %r2;
-; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
; SM60-NEXT: @%p2 bra $L__BB89_1;
; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB90_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB90_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB90_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB90_1;
+; SM60-NEXT: $L__BB90_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB91_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB91_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB91_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB91_1;
+; SM60-NEXT: $L__BB91_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB92_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB92_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB92_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB92_1;
+; SM60-NEXT: $L__BB92_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB93_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB93_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB93_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB93_1;
+; SM60-NEXT: $L__BB93_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB94_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB94_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB94_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB94_1;
+; SM60-NEXT: $L__BB94_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB95_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB95_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB95_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB95_1;
+; SM60-NEXT: $L__BB95_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB96_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB96_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB96_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB96_1;
+; SM60-NEXT: $L__BB96_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB97_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB97_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB97_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB97_1;
+; SM60-NEXT: $L__BB97_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB98_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB98_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB98_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB98_1;
+; SM60-NEXT: $L__BB98_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB99_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB99_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB99_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB99_1;
+; SM60-NEXT: $L__BB99_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB100_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB100_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB100_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB100_1;
+; SM60-NEXT: $L__BB100_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB101_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB101_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB101_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB101_1;
+; SM60-NEXT: $L__BB101_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB102_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB102_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB102_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB102_1;
+; SM60-NEXT: $L__BB102_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB103_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB103_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB103_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB103_1;
+; SM60-NEXT: $L__BB103_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB104_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB104_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB104_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB104_1;
+; SM60-NEXT: $L__BB104_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB105_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB105_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB105_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB105_1;
+; SM60-NEXT: $L__BB105_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB106_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB106_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB106_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB106_1;
+; SM60-NEXT: $L__BB106_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB107_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB107_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB107_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB107_1;
+; SM60-NEXT: $L__BB107_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB108_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB108_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB108_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB108_1;
+; SM60-NEXT: $L__BB108_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB109_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB109_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB109_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB109_1;
+; SM60-NEXT: $L__BB109_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB110_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB110_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB110_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB110_1;
+; SM60-NEXT: $L__BB110_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB111_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB111_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB111_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB111_1;
+; SM60-NEXT: $L__BB111_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB112_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB112_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB112_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB112_1;
+; SM60-NEXT: $L__BB112_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB113_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB113_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB113_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB113_1;
+; SM60-NEXT: $L__BB113_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB114_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB114_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB114_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB114_1;
+; SM60-NEXT: $L__BB114_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB115_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB115_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB115_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB115_1;
+; SM60-NEXT: $L__BB115_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB116_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB116_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB116_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB116_1;
+; SM60-NEXT: $L__BB116_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB117_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB117_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB117_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB117_1;
+; SM60-NEXT: $L__BB117_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB118_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB118_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB118_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB118_1;
+; SM60-NEXT: $L__BB118_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB119_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB119_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB119_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB119_1;
+; SM60-NEXT: $L__BB119_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB120_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB120_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB120_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB120_1;
+; SM60-NEXT: $L__BB120_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB121_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB121_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB121_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB121_1;
+; SM60-NEXT: $L__BB121_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB122_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB122_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB122_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB122_1;
+; SM60-NEXT: $L__BB122_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB123_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB123_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB123_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB123_1;
+; SM60-NEXT: $L__BB123_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB124_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB124_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB124_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB124_1;
+; SM60-NEXT: $L__BB124_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB125_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB125_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB125_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB125_1;
+; SM60-NEXT: $L__BB125_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB126_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB126_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB126_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB126_1;
+; SM60-NEXT: $L__BB126_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB127_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB127_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB127_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB127_1;
+; SM60-NEXT: $L__BB127_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB128_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB128_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB128_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB128_1;
+; SM60-NEXT: $L__BB128_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB129_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB129_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB129_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB129_1;
+; SM60-NEXT: $L__BB129_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB130_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB130_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB130_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB130_1;
+; SM60-NEXT: $L__BB130_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB131_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB131_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB131_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB131_1;
+; SM60-NEXT: $L__BB131_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB132_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB132_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB132_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB132_1;
+; SM60-NEXT: $L__BB132_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB133_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB133_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB133_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB133_1;
+; SM60-NEXT: $L__BB133_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB134_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB134_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB134_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB134_1;
+; SM60-NEXT: $L__BB134_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB135_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB135_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB135_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB135_1;
+; SM60-NEXT: $L__BB135_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB136_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB136_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB136_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB136_1;
+; SM60-NEXT: $L__BB136_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB137_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB137_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB137_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB137_1;
+; SM60-NEXT: $L__BB137_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB138_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB138_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB138_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB138_1;
+; SM60-NEXT: $L__BB138_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB139_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB139_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB139_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB139_1;
+; SM60-NEXT: $L__BB139_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB140_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB140_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB140_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB140_1;
+; SM60-NEXT: $L__BB140_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB141_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB141_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB141_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB141_1;
+; SM60-NEXT: $L__BB141_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB142_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB142_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB142_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB142_1;
+; SM60-NEXT: $L__BB142_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB143_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB143_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB143_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB143_1;
+; SM60-NEXT: $L__BB143_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB144_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB144_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB144_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB144_1;
+; SM60-NEXT: $L__BB144_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB145_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB145_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB145_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB145_1;
+; SM60-NEXT: $L__BB145_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB146_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB146_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB146_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB146_1;
+; SM60-NEXT: $L__BB146_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB147_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB147_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB147_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB147_1;
+; SM60-NEXT: $L__BB147_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB148_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB148_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB148_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB148_1;
+; SM60-NEXT: $L__BB148_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB149_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB149_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB149_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB149_1;
+; SM60-NEXT: $L__BB149_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB150_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB150_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB150_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB150_1;
+; SM60-NEXT: $L__BB150_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB151_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB151_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB151_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB151_1;
+; SM60-NEXT: $L__BB151_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB152_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB152_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB152_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB152_1;
+; SM60-NEXT: $L__BB152_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB153_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB153_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB153_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB153_1;
+; SM60-NEXT: $L__BB153_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB154_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB154_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB154_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB154_1;
+; SM60-NEXT: $L__BB154_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB155_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB155_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB155_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB155_1;
+; SM60-NEXT: $L__BB155_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB156_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB156_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB156_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB156_1;
+; SM60-NEXT: $L__BB156_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB157_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB157_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB157_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB157_1;
+; SM60-NEXT: $L__BB157_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB158_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB158_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB158_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB158_1;
+; SM60-NEXT: $L__BB158_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB159_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB159_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB159_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB159_1;
+; SM60-NEXT: $L__BB159_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB160_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB160_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB160_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB160_1;
+; SM60-NEXT: $L__BB160_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB161_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB161_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB161_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB161_1;
+; SM60-NEXT: $L__BB161_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB162_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB162_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB162_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB162_1;
+; SM60-NEXT: $L__BB162_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB163_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB163_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB163_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB163_1;
+; SM60-NEXT: $L__BB163_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB164_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB164_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB164_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB164_1;
+; SM60-NEXT: $L__BB164_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB165_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB165_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB165_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB165_1;
+; SM60-NEXT: $L__BB165_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB166_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB166_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB166_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB166_1;
+; SM60-NEXT: $L__BB166_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB167_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB167_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB167_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB167_1;
+; SM60-NEXT: $L__BB167_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB168_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB168_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB168_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB168_1;
+; SM60-NEXT: $L__BB168_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB169_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB169_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB169_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB169_1;
+; SM60-NEXT: $L__BB169_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB170_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB170_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB170_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB170_1;
+; SM60-NEXT: $L__BB170_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB171_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB171_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB171_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB171_1;
+; SM60-NEXT: $L__BB171_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB172_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB172_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB172_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB172_1;
+; SM60-NEXT: $L__BB172_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB173_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB173_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB173_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB173_1;
+; SM60-NEXT: $L__BB173_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB174_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB174_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB174_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB174_1;
+; SM60-NEXT: $L__BB174_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB175_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB175_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB175_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB175_1;
+; SM60-NEXT: $L__BB175_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB176_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB176_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB176_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB176_1;
+; SM60-NEXT: $L__BB176_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB177_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB177_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB177_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB177_1;
+; SM60-NEXT: $L__BB177_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB178_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB178_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB178_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB178_1;
+; SM60-NEXT: $L__BB178_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB179_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB179_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB179_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB179_1;
+; SM60-NEXT: $L__BB179_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB180_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB180_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB180_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB180_1;
+; SM60-NEXT: $L__BB180_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB181_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB181_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB181_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB181_1;
+; SM60-NEXT: $L__BB181_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB182_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB182_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB182_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB182_1;
+; SM60-NEXT: $L__BB182_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB183_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB183_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB183_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB183_1;
+; SM60-NEXT: $L__BB183_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB184_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB184_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB184_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB184_1;
+; SM60-NEXT: $L__BB184_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB185_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB185_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB185_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB185_1;
+; SM60-NEXT: $L__BB185_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB186_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB186_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB186_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB186_1;
+; SM60-NEXT: $L__BB186_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB187_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB187_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB187_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB187_1;
+; SM60-NEXT: $L__BB187_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB188_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB188_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB188_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB188_1;
+; SM60-NEXT: $L__BB188_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB189_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB189_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB189_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB189_1;
+; SM60-NEXT: $L__BB189_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB190_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB190_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB190_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB190_1;
+; SM60-NEXT: $L__BB190_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB191_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB191_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB191_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB191_1;
+; SM60-NEXT: $L__BB191_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB192_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB192_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB192_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB192_1;
+; SM60-NEXT: $L__BB192_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB193_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB193_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB193_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB193_1;
+; SM60-NEXT: $L__BB193_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB194_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB194_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB194_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB194_1;
+; SM60-NEXT: $L__BB194_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB195_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB195_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB195_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB195_1;
+; SM60-NEXT: $L__BB195_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB196_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB196_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB196_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB196_1;
+; SM60-NEXT: $L__BB196_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB197_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB197_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB197_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB197_1;
+; SM60-NEXT: $L__BB197_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB198_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB198_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB198_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB198_1;
+; SM60-NEXT: $L__BB198_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB199_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB199_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB199_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB199_1;
+; SM60-NEXT: $L__BB199_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB200_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB200_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB200_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB200_1;
+; SM60-NEXT: $L__BB200_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB201_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB201_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB201_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB201_1;
+; SM60-NEXT: $L__BB201_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB202_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB202_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB202_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB202_1;
+; SM60-NEXT: $L__BB202_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB203_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB203_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB203_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB203_1;
+; SM60-NEXT: $L__BB203_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB204_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB204_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB204_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB204_1;
+; SM60-NEXT: $L__BB204_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB205_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB205_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB205_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB205_1;
+; SM60-NEXT: $L__BB205_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB206_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB206_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB206_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB206_1;
+; SM60-NEXT: $L__BB206_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB207_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB207_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB207_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB207_1;
+; SM60-NEXT: $L__BB207_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB208_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB208_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB208_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB208_1;
+; SM60-NEXT: $L__BB208_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB209_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB209_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB209_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB209_1;
+; SM60-NEXT: $L__BB209_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB210_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB210_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB210_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB210_1;
+; SM60-NEXT: $L__BB210_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB211_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB211_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB211_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB211_1;
+; SM60-NEXT: $L__BB211_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB212_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB212_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB212_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB212_1;
+; SM60-NEXT: $L__BB212_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB213_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB213_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB213_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB213_1;
+; SM60-NEXT: $L__BB213_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB214_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB214_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB214_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB214_1;
+; SM60-NEXT: $L__BB214_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB215_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB215_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB215_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB215_1;
+; SM60-NEXT: $L__BB215_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB216_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB216_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB216_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB216_1;
+; SM60-NEXT: $L__BB216_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB217_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB217_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB217_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB217_1;
+; SM60-NEXT: $L__BB217_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB218_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB218_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB218_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB218_1;
+; SM60-NEXT: $L__BB218_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB219_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB219_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB219_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB219_1;
+; SM60-NEXT: $L__BB219_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB220_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB220_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB220_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB220_1;
+; SM60-NEXT: $L__BB220_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB221_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB221_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB221_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB221_1;
+; SM60-NEXT: $L__BB221_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB222_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB222_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB222_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB222_1;
+; SM60-NEXT: $L__BB222_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB223_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB223_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB223_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB223_1;
+; SM60-NEXT: $L__BB223_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB224_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB224_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB224_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB224_1;
+; SM60-NEXT: $L__BB224_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB225_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB225_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB225_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB225_1;
+; SM60-NEXT: $L__BB225_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB226_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB226_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB226_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB226_1;
+; SM60-NEXT: $L__BB226_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB227_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB227_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB227_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB227_1;
+; SM60-NEXT: $L__BB227_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB228_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB228_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB228_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB228_1;
+; SM60-NEXT: $L__BB228_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB229_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB229_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB229_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB229_1;
+; SM60-NEXT: $L__BB229_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB230_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB230_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB230_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB230_1;
+; SM60-NEXT: $L__BB230_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB231_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB231_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB231_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB231_1;
+; SM60-NEXT: $L__BB231_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB232_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB232_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB232_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB232_1;
+; SM60-NEXT: $L__BB232_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB233_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB233_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB233_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB233_1;
+; SM60-NEXT: $L__BB233_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB234_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB234_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB234_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB234_1;
+; SM60-NEXT: $L__BB234_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB235_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB235_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB235_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB235_1;
+; SM60-NEXT: $L__BB235_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB236_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB236_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB236_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB236_1;
+; SM60-NEXT: $L__BB236_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB237_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB237_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB237_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB237_1;
+; SM60-NEXT: $L__BB237_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB238_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB238_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB238_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB238_1;
+; SM60-NEXT: $L__BB238_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB239_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB239_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB239_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB239_1;
+; SM60-NEXT: $L__BB239_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB240_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB240_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB240_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB240_1;
+; SM60-NEXT: $L__BB240_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB241_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB241_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB241_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB241_1;
+; SM60-NEXT: $L__BB241_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB242_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB242_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB242_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB242_1;
+; SM60-NEXT: $L__BB242_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB243_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB243_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB243_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB243_1;
+; SM60-NEXT: $L__BB243_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB244_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB244_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB244_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB244_1;
+; SM60-NEXT: $L__BB244_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB245_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB245_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB245_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB245_1;
+; SM60-NEXT: $L__BB245_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB246_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB246_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB246_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB246_1;
+; SM60-NEXT: $L__BB246_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB247_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB247_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB247_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB247_1;
+; SM60-NEXT: $L__BB247_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB248_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB248_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB248_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB248_1;
+; SM60-NEXT: $L__BB248_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB249_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB249_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB249_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB249_1;
+; SM60-NEXT: $L__BB249_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB250_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB250_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB250_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB250_1;
+; SM60-NEXT: $L__BB250_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB251_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB251_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB251_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB251_1;
+; SM60-NEXT: $L__BB251_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB252_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB252_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB252_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB252_1;
+; SM60-NEXT: $L__BB252_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB253_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB253_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB253_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB253_1;
+; SM60-NEXT: $L__BB253_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB254_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB254_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB254_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB254_1;
+; SM60-NEXT: $L__BB254_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB255_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB255_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB255_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB255_1;
+; SM60-NEXT: $L__BB255_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB256_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB256_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB256_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB256_1;
+; SM60-NEXT: $L__BB256_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB257_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB257_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB257_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB257_1;
+; SM60-NEXT: $L__BB257_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB258_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB258_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB258_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB258_1;
+; SM60-NEXT: $L__BB258_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB259_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB259_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB259_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB259_1;
+; SM60-NEXT: $L__BB259_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB260_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB260_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB260_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB260_1;
+; SM60-NEXT: $L__BB260_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB261_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB261_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB261_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB261_1;
+; SM60-NEXT: $L__BB261_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB262_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB262_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB262_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB262_1;
+; SM60-NEXT: $L__BB262_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB263_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB263_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB263_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB263_1;
+; SM60-NEXT: $L__BB263_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB264_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB264_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB264_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB264_1;
+; SM60-NEXT: $L__BB264_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB265_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB265_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB265_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB265_1;
+; SM60-NEXT: $L__BB265_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB266_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB266_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB266_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB266_1;
+; SM60-NEXT: $L__BB266_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB267_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB267_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB267_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB267_1;
+; SM60-NEXT: $L__BB267_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB268_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB268_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB268_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB268_1;
+; SM60-NEXT: $L__BB268_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB269_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB269_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB269_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB269_1;
+; SM60-NEXT: $L__BB269_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+ ret i16 %new
+}
+
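+; The i16 tests above check the partword emulation path: the narrow cmpxchg is
+; performed with atom.cas.b32 on the containing aligned 32-bit word inside a
+; %partword.cmpxchg.loop, with membar.{cta,gl,sys} fences emitted according to
+; the requested ordering and syncscope. The i32 tests below use
+; atom{.global,.shared}.cas.b32 on the address directly, with no emulation loop.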
+define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_generic_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_generic_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_generic_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_global_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_global_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_shared_sys(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_shared_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_shared_gpu(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+ ret i64 %new
}
-define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_monotonic_i32_generic(
+define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_generic_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+ ret i64 %new
}
-define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_monotonic_i32_global(
+define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_generic_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+ ret i64 %new
}
-define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_monotonic_i32_shared(
+define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_generic_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+ ret i64 %new
}
-define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_acquire_i32_generic(
+define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_global_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+ ret i64 %new
}
-define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_acquire_i32_global(
+define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_global_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+ ret i64 %new
}
-define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_acquire_i32_shared(
+define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_global_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+ ret i64 %new
}
-define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_seq_cst_i32_generic(
+define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_shared_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+ ret i64 %new
}
-define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_seq_cst_i32_global(
+define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_shared_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+ ret i64 %new
}
-define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: monotonic_seq_cst_i32_shared(
+define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_shared_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+ ret i64 %new
}
-define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_monotonic_i32_generic(
+define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_generic_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+ ret i64 %new
}
-define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_monotonic_i32_global(
+define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_generic_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+ ret i64 %new
}
-define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_monotonic_i32_shared(
+define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_generic_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+ ret i64 %new
}
-define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_acquire_i32_generic(
+define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_global_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+ ret i64 %new
}
-define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_acquire_i32_global(
+define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_global_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+ ret i64 %new
}
-define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_acquire_i32_shared(
+define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_global_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+ ret i64 %new
}
-define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_seq_cst_i32_generic(
+define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_shared_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+ ret i64 %new
}
-define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_seq_cst_i32_global(
+define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_shared_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+ ret i64 %new
}
-define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acquire_seq_cst_i32_shared(
+define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_shared_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+ ret i64 %new
}
-define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_monotonic_i32_generic(
+define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_generic_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire
+ ret i64 %new
}
-define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_monotonic_i32_global(
+define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_generic_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+ ret i64 %new
}
-define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_monotonic_i32_shared(
+define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_generic_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+ ret i64 %new
}
-define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_acquire_i32_generic(
+define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_global_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+ ret i64 %new
}
-define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_acquire_i32_global(
+define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_global_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+ ret i64 %new
}
-define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_acquire_i32_shared(
+define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_global_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+ ret i64 %new
}
-define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_seq_cst_i32_generic(
+define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_shared_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+ ret i64 %new
}
-define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_seq_cst_i32_global(
+define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_shared_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+ ret i64 %new
}
-define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: release_seq_cst_i32_shared(
+define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_shared_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+ ret i64 %new
}
-define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_monotonic_i32_generic(
+define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_generic_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_monotonic_i32_global(
+define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_generic_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_monotonic_i32_shared(
+define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_generic_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_acquire_i32_generic(
+define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_global_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_acquire_i32_global(
+define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_global_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_acquire_i32_shared(
+define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_global_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
-; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i32_generic(
+define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_shared_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i32_global(
+define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_shared_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i32_shared(
+define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_shared_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+ ret i64 %new
}
-define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_monotonic_i32_generic(
+define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_generic_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_monotonic_i32_global(
+define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_generic_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_monotonic_i32_shared(
+define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_generic_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_acquire_i32_generic(
+define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_global_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_acquire_i32_global(
+define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_global_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_acquire_i32_shared(
+define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_global_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i32_generic(
+define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_shared_sys(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
-; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i32_global(
+define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_shared_cta(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
-; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i32_shared(
+define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_shared_gpu(
; SM60: {
-; SM60-NEXT: .reg .b32 %r<4>;
-; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
-; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+ ret i64 %new
}
-define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_monotonic_i64_generic(
+define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_generic_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2];
; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_monotonic_i64_global(
+define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_generic_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_monotonic_i64_shared(
+define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_acquire_i64_generic(
+define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_global_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_acquire_i64_global(
+define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_global_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2];
; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_acquire_i64_shared(
+define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_global_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_seq_cst_i64_generic(
+define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_shared_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_seq_cst_i64_global(
+define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_shared_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: monotonic_seq_cst_i64_shared(
+define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2];
; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
ret i64 %new
}
-define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_monotonic_i64_generic(
+define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_generic_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2];
; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_monotonic_i64_global(
+define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_generic_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_monotonic_i64_shared(
+define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_acquire_i64_generic(
+define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_global_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_acquire_i64_global(
+define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_global_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2];
; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_acquire_i64_shared(
+define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_global_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_seq_cst_i64_generic(
+define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_shared_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_seq_cst_i64_global(
+define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_shared_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acquire_seq_cst_i64_shared(
+define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2];
; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
ret i64 %new
}
-define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_monotonic_i64_generic(
+define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_generic_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2];
; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
ret i64 %new
}
-define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_monotonic_i64_global(
+define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_generic_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
ret i64 %new
}
-define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_monotonic_i64_shared(
+define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
ret i64 %new
}
-define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_acquire_i64_generic(
+define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_global_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
ret i64 %new
}
-define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_acquire_i64_global(
+define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_global_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2];
; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
ret i64 %new
}
-define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_acquire_i64_shared(
+define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_global_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
ret i64 %new
}
-define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_seq_cst_i64_generic(
+define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_shared_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
ret i64 %new
}
-define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_seq_cst_i64_global(
+define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_shared_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
ret i64 %new
}
-define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: release_seq_cst_i64_shared(
+define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2];
; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
ret i64 %new
}
-define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_monotonic_i64_generic(
+define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_generic_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2];
; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_monotonic_i64_global(
+define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_generic_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_monotonic_i64_shared(
+define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_acquire_i64_generic(
+define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_global_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_acquire_i64_global(
+define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_global_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2];
; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_acquire_i64_shared(
+define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_global_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i64_generic(
+define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_shared_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i64_global(
+define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_shared_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: acq_rel_seq_cst_i64_shared(
+define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2];
; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
ret i64 %new
}
-define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_monotonic_i64_generic(
+define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_generic_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2];
; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_monotonic_i64_global(
+define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_generic_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_monotonic_i64_shared(
+define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_generic_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_acquire_i64_generic(
+define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_global_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_acquire_i64_global(
+define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_global_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2];
; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_acquire_i64_shared(
+define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_global_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
-; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i64_generic(
+define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_shared_sys(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0];
; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
-; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i64_global(
+define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_shared_cta(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
-; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM60-LABEL: seq_cst_seq_cst_i64_shared(
+define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_shared_gpu(
; SM60: {
; SM60-NEXT: .reg .b64 %rd<5>;
; SM60-EMPTY:
; SM60-NEXT: // %bb.0:
-; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
-; SM60-NEXT: membar.sys;
-; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
-; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0];
+; SM60-NEXT: membar.gl;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2];
; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
; SM60-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
ret i64 %new
}
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
index 4360ea36e863a..d8f961be05ab0 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
-define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_monotonic_i8_generic(
+define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_generic_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -31,7 +31,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB0_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_monotonic_i8_global(
+define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_generic_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB1_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_monotonic_i8_shared(
+define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB2_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_acquire_i8_generic(
+define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_global_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -158,15 +158,15 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB3_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB3_1;
; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_acquire_i8_global(
+define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -204,7 +203,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -212,7 +211,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB4_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB4_1;
; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_acquire_i8_shared(
+define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_global_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB5_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB5_1;
; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_seq_cst_i8_generic(
+define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_shared_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB6_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB6_1;
; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_seq_cst_i8_global(
+define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_shared_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -344,15 +338,15 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB7_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB7_1;
; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: monotonic_seq_cst_i8_shared(
+define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -391,7 +383,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -399,7 +391,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB8_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB8_1;
; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
ret i8 %new
}
-define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_monotonic_i8_generic(
+define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_generic_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -437,7 +428,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -445,7 +436,7 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB9_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -458,12 +449,12 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
ret i8 %new
}
-define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_monotonic_i8_global(
+define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_generic_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -471,8 +462,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -483,15 +474,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB10_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -501,15 +492,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB10_1;
; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
ret i8 %new
}
-define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_monotonic_i8_shared(
+define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -517,8 +508,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -529,15 +520,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB11_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -547,15 +538,15 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB11_1;
; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
ret i8 %new
}
-define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_acquire_i8_generic(
+define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_global_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -563,8 +554,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -575,15 +566,15 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB12_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -596,12 +587,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
ret i8 %new
}
-define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_acquire_i8_global(
+define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -609,8 +600,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -621,7 +612,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -629,7 +620,7 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB13_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -639,15 +630,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB13_1;
; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
ret i8 %new
}
-define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_acquire_i8_shared(
+define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_global_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -655,8 +646,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -667,15 +658,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB14_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -685,15 +676,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB14_1;
; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
ret i8 %new
}
-define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_seq_cst_i8_generic(
+define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_shared_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -701,9 +692,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -714,15 +704,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB15_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -735,12 +725,12 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
ret i8 %new
}
-define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_seq_cst_i8_global(
+define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_shared_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -748,9 +738,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -761,15 +750,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB16_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -779,15 +768,15 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB16_1;
; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
ret i8 %new
}
-define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acquire_seq_cst_i8_shared(
+define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -795,9 +784,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -808,7 +796,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -816,7 +804,7 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB17_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -826,15 +814,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB17_1;
; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
ret i8 %new
}
-define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_monotonic_i8_generic(
+define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_generic_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -842,9 +830,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -855,7 +843,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -863,7 +851,7 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB18_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -873,14 +861,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB18_1;
; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
ret i8 %new
}
-define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_monotonic_i8_global(
+define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_generic_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -888,9 +877,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -901,15 +890,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB19_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -919,14 +908,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB19_1;
; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
ret i8 %new
}
-define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_monotonic_i8_shared(
+define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -934,9 +924,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -947,15 +937,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB20_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -965,14 +955,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB20_1;
; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
ret i8 %new
}
-define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_acquire_i8_generic(
+define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_global_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -980,9 +971,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -993,15 +984,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB21_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1014,12 +1005,12 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
ret i8 %new
}
-define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_acquire_i8_global(
+define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1027,9 +1018,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1040,7 +1031,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -1048,7 +1039,7 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB22_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1058,15 +1049,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB22_1;
; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
ret i8 %new
}
-define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_acquire_i8_shared(
+define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_global_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1074,9 +1065,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1087,15 +1078,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB23_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1105,15 +1096,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB23_1;
; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
ret i8 %new
}
-define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_seq_cst_i8_generic(
+define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_shared_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1121,8 +1112,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0];
; SM70-NEXT: fence.sc.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
@@ -1134,15 +1125,15 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB24_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1155,12 +1146,12 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
ret i8 %new
}
-define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_seq_cst_i8_global(
+define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_shared_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1168,9 +1159,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1181,15 +1172,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB25_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1199,15 +1190,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB25_1;
; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
ret i8 %new
}
-define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: release_seq_cst_i8_shared(
+define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1215,9 +1206,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1228,7 +1219,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -1236,7 +1227,7 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB26_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1246,15 +1237,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB26_1;
; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
ret i8 %new
}
-define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_monotonic_i8_generic(
+define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_generic_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1262,9 +1253,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1275,7 +1265,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -1283,7 +1273,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB27_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1296,12 +1286,12 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_monotonic_i8_global(
+define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_generic_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1309,9 +1299,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1322,15 +1311,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB28_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1340,15 +1329,15 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB28_1;
; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_monotonic_i8_shared(
+define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1356,9 +1345,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1369,15 +1357,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB29_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1387,15 +1375,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB29_1;
; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_acquire_i8_generic(
+define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_global_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1403,9 +1391,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1416,15 +1403,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB30_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1437,12 +1424,12 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_acquire_i8_global(
+define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1450,9 +1437,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1463,7 +1449,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -1471,7 +1457,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB31_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1481,15 +1467,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB31_1;
; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_acquire_i8_shared(
+define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_global_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1497,9 +1483,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1510,15 +1495,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB32_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1528,15 +1513,15 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB32_1;
; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i8_generic(
+define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_shared_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1544,9 +1529,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1557,15 +1541,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB33_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1578,12 +1562,12 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i8_global(
+define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_shared_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1591,9 +1575,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1604,15 +1587,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB34_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1622,15 +1605,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB34_1;
; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
ret i8 %new
}
-define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i8_shared(
+define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1638,9 +1621,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1651,7 +1633,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -1659,7 +1641,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB35_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB35_1;
; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
ret i8 %new
}
-define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_monotonic_i8_generic(
+define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_generic_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1685,9 +1667,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1698,7 +1679,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -1706,7 +1687,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB36_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1719,12 +1700,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_monotonic_i8_global(
+define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_generic_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1732,9 +1713,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1745,15 +1725,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB37_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB37_1;
; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_monotonic_i8_shared(
+define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1779,9 +1759,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1792,15 +1771,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: ld.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB38_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB38_1;
; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_acquire_i8_generic(
+define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_global_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1826,9 +1805,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1839,15 +1817,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB39_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1860,12 +1838,12 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_acquire_i8_global(
+define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1873,9 +1851,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1886,7 +1863,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -1894,7 +1871,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB40_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1904,15 +1881,15 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB40_1;
; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_acquire_i8_shared(
+define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_global_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1920,9 +1897,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1933,15 +1909,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB41_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB41_1;
; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i8_generic(
+define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_shared_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -1967,9 +1943,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -1980,15 +1955,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB42_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -2001,12 +1976,12 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i8_global(
+define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_shared_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -2014,9 +1989,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -2027,15 +2001,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB43_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB43_1;
; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
ret i8 %new
}
-define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i8_shared(
+define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -2061,9 +2035,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
; SM70-NEXT: and.b32 %r10, %r9, 3;
@@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1];
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
; SM70-NEXT: and.b32 %r20, %r16, %r2;
@@ -2082,7 +2055,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB44_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -2092,3589 +2065,14968 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB44_1;
; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
ret i8 %new
}
-define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_monotonic_i16_generic(
+define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_generic_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB45_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB45_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB45_1;
; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_monotonic_i16_global(
+define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_generic_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB46_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB46_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB46_1;
; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_monotonic_i16_shared(
+define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB47_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB47_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB47_1;
; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_acquire_i16_generic(
+define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_global_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB48_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB48_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB48_1;
; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_acquire_i16_global(
+define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB49_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB49_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB49_1;
; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_acquire_i16_shared(
+define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_global_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB50_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB50_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB50_1;
; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_seq_cst_i16_generic(
+define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_shared_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB51_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB51_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB51_1;
; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_seq_cst_i16_global(
+define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_shared_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB52_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB52_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB52_1;
; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+ ret i8 %new
}
-define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: monotonic_seq_cst_i16_shared(
+define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB53_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB53_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB53_1;
; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+ ret i8 %new
}
-define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_monotonic_i16_generic(
+define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_generic_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB54_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB54_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB54_1;
; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+ ret i8 %new
}
-define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_monotonic_i16_global(
+define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_generic_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB55_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB55_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB55_1;
; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+ ret i8 %new
}
-define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_monotonic_i16_shared(
+define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB56_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB56_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB56_1;
; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+ ret i8 %new
}
-define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_acquire_i16_generic(
+define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_global_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB57_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB57_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB57_1;
; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+ ret i8 %new
}
-define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_acquire_i16_global(
+define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB58_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB58_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB58_1;
; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+ ret i8 %new
}
-define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_acquire_i16_shared(
+define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_global_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB59_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB59_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB59_1;
; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+ ret i8 %new
}
-define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_seq_cst_i16_generic(
+define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_shared_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB60_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB60_1;
; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+ ret i8 %new
}
-define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_seq_cst_i16_global(
+define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_shared_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB61_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB61_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
-; SM70-NEXT: @%p2 bra $L__BB61_1;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB61_1;
; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+ ret i8 %new
}
-define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acquire_seq_cst_i16_shared(
+define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB62_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB62_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB62_1;
; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+ ret i8 %new
}
-define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_monotonic_i16_generic(
+define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_generic_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0];
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB63_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB63_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB63_1;
; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire
+ ret i8 %new
}
-define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_monotonic_i16_global(
+define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_generic_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB64_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB64_1;
; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+ ret i8 %new
}
-define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_monotonic_i16_shared(
+define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB65_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB65_1;
; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+ ret i8 %new
}
-define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_acquire_i16_generic(
+define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_global_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0];
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB66_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB66_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB66_1;
; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+ ret i8 %new
}
-define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_acquire_i16_global(
+define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB67_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB67_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB67_1;
; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+ ret i8 %new
}
-define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_acquire_i16_shared(
+define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_global_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB68_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB68_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB68_1;
; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+ ret i8 %new
}
-define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_seq_cst_i16_generic(
+define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_shared_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB69_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB69_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB69_1;
; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+ ret i8 %new
}
-define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_seq_cst_i16_global(
+define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_shared_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB70_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB70_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB70_1;
; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+ ret i8 %new
}
-define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: release_seq_cst_i16_shared(
+define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB71_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB71_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB71_1;
; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+ ret i8 %new
}
-define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_monotonic_i16_generic(
+define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_generic_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB72_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB72_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB72_1;
; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_monotonic_i16_global(
+define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_generic_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB73_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB73_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB73_1;
; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_monotonic_i16_shared(
+define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB74_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB74_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB74_1;
; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_acquire_i16_generic(
+define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_global_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB75_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB75_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB75_1;
; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_acquire_i16_global(
+define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB76_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB76_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB76_1;
; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_acquire_i16_shared(
+define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_global_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB77_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB77_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB77_1;
; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i16_generic(
+define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_shared_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB78_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB78_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB78_1;
; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i16_global(
+define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_shared_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB79_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB79_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB79_1;
; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i16_shared(
+define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB80_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB80_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB80_1;
; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+ ret i8 %new
}
-define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_monotonic_i16_generic(
+define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_generic_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB81_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB81_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB81_1;
; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_monotonic_i16_global(
+define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_generic_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB82_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB82_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB82_1;
; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_monotonic_i16_shared(
+define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB83_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB83_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB83_1;
; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_acquire_i16_generic(
+define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_global_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB84_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB84_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB84_1;
; SM70-NEXT: $L__BB84_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_acquire_i16_global(
+define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_global_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB85_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB85_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB85_1;
; SM70-NEXT: $L__BB85_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_acquire_i16_shared(
+define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_global_gpu(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB86_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB86_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB86_1;
; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i16_generic(
+define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_shared_sys(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB87_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB87_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB87_1;
; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+ ret i8 %new
}
-define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i16_global(
+define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_shared_cta(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b32 %r<21>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.global.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB88_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB88_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB88_1;
; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB89_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB89_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB89_1;
+; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0];
; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB90_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB90_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB90_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB90_1;
+; SM70-NEXT: $L__BB90_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB91_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB91_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB91_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB91_1;
+; SM70-NEXT: $L__BB91_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB92_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB92_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB92_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB92_1;
+; SM70-NEXT: $L__BB92_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB93_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB93_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB93_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB93_1;
+; SM70-NEXT: $L__BB93_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB94_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB94_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB94_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB94_1;
+; SM70-NEXT: $L__BB94_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB95_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB95_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB95_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB95_1;
+; SM70-NEXT: $L__BB95_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB96_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB96_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB96_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB96_1;
+; SM70-NEXT: $L__BB96_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB97_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB97_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB97_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB97_1;
+; SM70-NEXT: $L__BB97_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB98_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB98_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB98_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB98_1;
+; SM70-NEXT: $L__BB98_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB99_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB99_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB99_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB99_1;
+; SM70-NEXT: $L__BB99_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB100_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB100_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB100_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB100_1;
+; SM70-NEXT: $L__BB100_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB101_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB101_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB101_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB101_1;
+; SM70-NEXT: $L__BB101_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB102_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB102_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB102_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB102_1;
+; SM70-NEXT: $L__BB102_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB103_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB103_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB103_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB103_1;
+; SM70-NEXT: $L__BB103_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB104_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB104_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB104_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB104_1;
+; SM70-NEXT: $L__BB104_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB105_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB105_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB105_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB105_1;
+; SM70-NEXT: $L__BB105_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB106_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB106_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB106_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB106_1;
+; SM70-NEXT: $L__BB106_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB107_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB107_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB107_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB107_1;
+; SM70-NEXT: $L__BB107_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB108_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB108_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB108_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB108_1;
+; SM70-NEXT: $L__BB108_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB109_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB109_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB109_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB109_1;
+; SM70-NEXT: $L__BB109_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB110_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB110_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB110_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB110_1;
+; SM70-NEXT: $L__BB110_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB111_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB111_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB111_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB111_1;
+; SM70-NEXT: $L__BB111_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB112_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB112_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB112_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB112_1;
+; SM70-NEXT: $L__BB112_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB113_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB113_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB113_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB113_1;
+; SM70-NEXT: $L__BB113_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB114_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB114_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB114_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB114_1;
+; SM70-NEXT: $L__BB114_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB115_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB115_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB115_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB115_1;
+; SM70-NEXT: $L__BB115_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB116_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB116_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB116_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB116_1;
+; SM70-NEXT: $L__BB116_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB117_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB117_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB117_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB117_1;
+; SM70-NEXT: $L__BB117_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB118_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB118_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB118_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB118_1;
+; SM70-NEXT: $L__BB118_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB119_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB119_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB119_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB119_1;
+; SM70-NEXT: $L__BB119_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB120_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB120_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB120_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB120_1;
+; SM70-NEXT: $L__BB120_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB121_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB121_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB121_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB121_1;
+; SM70-NEXT: $L__BB121_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB122_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB122_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB122_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB122_1;
+; SM70-NEXT: $L__BB122_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB123_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB123_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB123_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB123_1;
+; SM70-NEXT: $L__BB123_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB124_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB124_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB124_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB124_1;
+; SM70-NEXT: $L__BB124_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB125_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB125_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB125_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB125_1;
+; SM70-NEXT: $L__BB125_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB126_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB126_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB126_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB126_1;
+; SM70-NEXT: $L__BB126_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB127_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB127_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB127_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB127_1;
+; SM70-NEXT: $L__BB127_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB128_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB128_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB128_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB128_1;
+; SM70-NEXT: $L__BB128_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB129_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB129_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB129_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB129_1;
+; SM70-NEXT: $L__BB129_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB130_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB130_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB130_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB130_1;
+; SM70-NEXT: $L__BB130_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB131_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB131_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB131_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB131_1;
+; SM70-NEXT: $L__BB131_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB132_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB132_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB132_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB132_1;
+; SM70-NEXT: $L__BB132_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB133_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB133_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB133_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB133_1;
+; SM70-NEXT: $L__BB133_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB134_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB134_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB134_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB134_1;
+; SM70-NEXT: $L__BB134_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB135_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB135_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB135_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB135_1;
+; SM70-NEXT: $L__BB135_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB136_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB136_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB136_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB136_1;
+; SM70-NEXT: $L__BB136_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB137_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB137_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB137_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB137_1;
+; SM70-NEXT: $L__BB137_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB138_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB138_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB138_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB138_1;
+; SM70-NEXT: $L__BB138_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB139_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB139_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB139_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB139_1;
+; SM70-NEXT: $L__BB139_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB140_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB140_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB140_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB140_1;
+; SM70-NEXT: $L__BB140_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB141_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB141_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB141_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB141_1;
+; SM70-NEXT: $L__BB141_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB142_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB142_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB142_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB142_1;
+; SM70-NEXT: $L__BB142_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB143_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB143_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB143_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB143_1;
+; SM70-NEXT: $L__BB143_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB144_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB144_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB144_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB144_1;
+; SM70-NEXT: $L__BB144_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB145_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB145_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB145_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB145_1;
+; SM70-NEXT: $L__BB145_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB146_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB146_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB146_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB146_1;
+; SM70-NEXT: $L__BB146_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB147_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB147_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB147_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB147_1;
+; SM70-NEXT: $L__BB147_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB148_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB148_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB148_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB148_1;
+; SM70-NEXT: $L__BB148_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB149_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB149_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB149_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB149_1;
+; SM70-NEXT: $L__BB149_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB150_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB150_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB150_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB150_1;
+; SM70-NEXT: $L__BB150_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB151_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB151_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB151_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB151_1;
+; SM70-NEXT: $L__BB151_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB152_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB152_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB152_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB152_1;
+; SM70-NEXT: $L__BB152_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB153_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB153_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB153_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB153_1;
+; SM70-NEXT: $L__BB153_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB154_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB154_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB154_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB154_1;
+; SM70-NEXT: $L__BB154_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB155_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB155_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB155_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB155_1;
+; SM70-NEXT: $L__BB155_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB156_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB156_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB156_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB156_1;
+; SM70-NEXT: $L__BB156_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB157_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB157_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB157_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB157_1;
+; SM70-NEXT: $L__BB157_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB158_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB158_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB158_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB158_1;
+; SM70-NEXT: $L__BB158_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB159_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB159_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB159_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB159_1;
+; SM70-NEXT: $L__BB159_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB160_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB160_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB160_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB160_1;
+; SM70-NEXT: $L__BB160_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB161_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB161_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB161_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB161_1;
+; SM70-NEXT: $L__BB161_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB162_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB162_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB162_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB162_1;
+; SM70-NEXT: $L__BB162_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB163_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB163_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB163_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB163_1;
+; SM70-NEXT: $L__BB163_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB164_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB164_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB164_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB164_1;
+; SM70-NEXT: $L__BB164_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB165_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB165_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB165_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB165_1;
+; SM70-NEXT: $L__BB165_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB166_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB166_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB166_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB166_1;
+; SM70-NEXT: $L__BB166_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB167_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB167_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB167_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB167_1;
+; SM70-NEXT: $L__BB167_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB168_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB168_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB168_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB168_1;
+; SM70-NEXT: $L__BB168_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB169_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB169_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB169_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB169_1;
+; SM70-NEXT: $L__BB169_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB170_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB170_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB170_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB170_1;
+; SM70-NEXT: $L__BB170_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB171_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB171_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB171_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB171_1;
+; SM70-NEXT: $L__BB171_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB172_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB172_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB172_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB172_1;
+; SM70-NEXT: $L__BB172_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB173_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB173_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB173_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB173_1;
+; SM70-NEXT: $L__BB173_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB174_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB174_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB174_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB174_1;
+; SM70-NEXT: $L__BB174_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB175_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB175_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB175_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB175_1;
+; SM70-NEXT: $L__BB175_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB176_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB176_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB176_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB176_1;
+; SM70-NEXT: $L__BB176_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB177_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB177_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB177_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB177_1;
+; SM70-NEXT: $L__BB177_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB178_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB178_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB178_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB178_1;
+; SM70-NEXT: $L__BB178_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB179_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB179_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB179_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB179_1;
+; SM70-NEXT: $L__BB179_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB180_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB180_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB180_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB180_1;
+; SM70-NEXT: $L__BB180_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB181_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB181_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB181_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB181_1;
+; SM70-NEXT: $L__BB181_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB182_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB182_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB182_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB182_1;
+; SM70-NEXT: $L__BB182_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB183_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB183_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB183_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB183_1;
+; SM70-NEXT: $L__BB183_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB184_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB184_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB184_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB184_1;
+; SM70-NEXT: $L__BB184_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB185_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB185_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB185_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB185_1;
+; SM70-NEXT: $L__BB185_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB186_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB186_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB186_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB186_1;
+; SM70-NEXT: $L__BB186_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB187_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB187_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB187_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB187_1;
+; SM70-NEXT: $L__BB187_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB188_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB188_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB188_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB188_1;
+; SM70-NEXT: $L__BB188_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB189_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB189_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB189_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB189_1;
+; SM70-NEXT: $L__BB189_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB190_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB190_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB190_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB190_1;
+; SM70-NEXT: $L__BB190_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB191_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB191_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB191_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB191_1;
+; SM70-NEXT: $L__BB191_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB192_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB192_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB192_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB192_1;
+; SM70-NEXT: $L__BB192_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB193_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB193_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB193_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB193_1;
+; SM70-NEXT: $L__BB193_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB194_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB194_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB194_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB194_1;
+; SM70-NEXT: $L__BB194_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB195_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB195_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB195_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB195_1;
+; SM70-NEXT: $L__BB195_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB196_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB196_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB196_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB196_1;
+; SM70-NEXT: $L__BB196_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB197_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB197_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB197_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB197_1;
+; SM70-NEXT: $L__BB197_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB198_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB198_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB198_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB198_1;
+; SM70-NEXT: $L__BB198_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB199_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB199_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB199_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB199_1;
+; SM70-NEXT: $L__BB199_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB200_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB200_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB200_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB200_1;
+; SM70-NEXT: $L__BB200_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB201_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB201_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB201_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB201_1;
+; SM70-NEXT: $L__BB201_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB202_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB202_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB202_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB202_1;
+; SM70-NEXT: $L__BB202_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB203_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB203_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB203_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB203_1;
+; SM70-NEXT: $L__BB203_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB204_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB204_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB204_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB204_1;
+; SM70-NEXT: $L__BB204_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB205_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB205_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB205_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB205_1;
+; SM70-NEXT: $L__BB205_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB206_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB206_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB206_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB206_1;
+; SM70-NEXT: $L__BB206_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB207_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB207_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB207_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB207_1;
+; SM70-NEXT: $L__BB207_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB208_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB208_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB208_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB208_1;
+; SM70-NEXT: $L__BB208_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB209_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB209_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB209_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB209_1;
+; SM70-NEXT: $L__BB209_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB210_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB210_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB210_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB210_1;
+; SM70-NEXT: $L__BB210_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB211_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB211_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB211_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB211_1;
+; SM70-NEXT: $L__BB211_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB212_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB212_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB212_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB212_1;
+; SM70-NEXT: $L__BB212_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB213_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB213_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB213_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB213_1;
+; SM70-NEXT: $L__BB213_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB214_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB214_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB214_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB214_1;
+; SM70-NEXT: $L__BB214_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB215_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB215_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB215_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB215_1;
+; SM70-NEXT: $L__BB215_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB216_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB216_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB216_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB216_1;
+; SM70-NEXT: $L__BB216_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB217_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB217_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB217_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB217_1;
+; SM70-NEXT: $L__BB217_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB218_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB218_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB218_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB218_1;
+; SM70-NEXT: $L__BB218_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB219_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB219_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB219_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB219_1;
+; SM70-NEXT: $L__BB219_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB220_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB220_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB220_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB220_1;
+; SM70-NEXT: $L__BB220_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB221_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB221_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB221_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB221_1;
+; SM70-NEXT: $L__BB221_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB222_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB222_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB222_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB222_1;
+; SM70-NEXT: $L__BB222_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB223_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB223_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB223_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB223_1;
+; SM70-NEXT: $L__BB223_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB224_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB224_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB224_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB224_1;
+; SM70-NEXT: $L__BB224_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB225_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB225_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB225_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB225_1;
+; SM70-NEXT: $L__BB225_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB226_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB226_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB226_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB226_1;
+; SM70-NEXT: $L__BB226_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB227_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB227_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB227_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB227_1;
+; SM70-NEXT: $L__BB227_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB228_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB228_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB228_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB228_1;
+; SM70-NEXT: $L__BB228_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB229_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB229_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB229_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB229_1;
+; SM70-NEXT: $L__BB229_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB230_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB230_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB230_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB230_1;
+; SM70-NEXT: $L__BB230_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB231_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB231_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB231_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB231_1;
+; SM70-NEXT: $L__BB231_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB232_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB232_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB232_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB232_1;
+; SM70-NEXT: $L__BB232_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0];
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB233_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB233_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB233_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB233_1;
+; SM70-NEXT: $L__BB233_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB234_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB234_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB234_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB234_1;
+; SM70-NEXT: $L__BB234_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB235_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB235_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB235_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB235_1;
+; SM70-NEXT: $L__BB235_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB236_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB236_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB236_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB236_1;
+; SM70-NEXT: $L__BB236_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB237_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB237_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB237_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB237_1;
+; SM70-NEXT: $L__BB237_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB238_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB238_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB238_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB238_1;
+; SM70-NEXT: $L__BB238_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB239_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB239_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB239_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB239_1;
+; SM70-NEXT: $L__BB239_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB240_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB240_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB240_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB240_1;
+; SM70-NEXT: $L__BB240_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB241_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB241_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB241_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB241_1;
+; SM70-NEXT: $L__BB241_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB242_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB242_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB242_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB242_1;
+; SM70-NEXT: $L__BB242_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB243_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB243_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB243_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB243_1;
+; SM70-NEXT: $L__BB243_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB244_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB244_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB244_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB244_1;
+; SM70-NEXT: $L__BB244_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB245_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB245_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB245_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB245_1;
+; SM70-NEXT: $L__BB245_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB246_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB246_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB246_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB246_1;
+; SM70-NEXT: $L__BB246_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB247_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB247_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB247_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB247_1;
+; SM70-NEXT: $L__BB247_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB248_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB248_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB248_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB248_1;
+; SM70-NEXT: $L__BB248_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB249_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB249_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB249_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB249_1;
+; SM70-NEXT: $L__BB249_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB250_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB250_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB250_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB250_1;
+; SM70-NEXT: $L__BB250_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB251_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB251_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB251_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB251_1;
+; SM70-NEXT: $L__BB251_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB252_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB252_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB252_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB252_1;
+; SM70-NEXT: $L__BB252_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB253_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB253_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB253_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB253_1;
+; SM70-NEXT: $L__BB253_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB254_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB254_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB254_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB254_1;
+; SM70-NEXT: $L__BB254_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB255_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB255_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB255_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB255_1;
+; SM70-NEXT: $L__BB255_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB256_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB256_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB256_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB256_1;
+; SM70-NEXT: $L__BB256_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB257_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB257_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB257_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB257_1;
+; SM70-NEXT: $L__BB257_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB258_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB258_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB258_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB258_1;
+; SM70-NEXT: $L__BB258_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB259_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB259_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB259_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB259_1;
+; SM70-NEXT: $L__BB259_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB260_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB260_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB260_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB260_1;
+; SM70-NEXT: $L__BB260_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB261_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB261_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB261_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB261_1;
+; SM70-NEXT: $L__BB261_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB262_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB262_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB262_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB262_1;
+; SM70-NEXT: $L__BB262_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB263_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB263_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB263_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB263_1;
+; SM70-NEXT: $L__BB263_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB264_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB264_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB264_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB264_1;
+; SM70-NEXT: $L__BB264_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB265_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB265_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB265_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB265_1;
+; SM70-NEXT: $L__BB265_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB266_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB266_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB266_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB266_1;
+; SM70-NEXT: $L__BB266_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB267_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB267_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB267_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB267_1;
+; SM70-NEXT: $L__BB267_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB268_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB268_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB268_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB268_1;
+; SM70-NEXT: $L__BB268_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB269_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB269_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB269_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB269_1;
+; SM70-NEXT: $L__BB269_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2];
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2];
+; SM70-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2];
+; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2];
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2];
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2];
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2];
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+ ret i32 %new
+}
+
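+; acquire/monotonic cmpxchg: the SM70 checks below expect a plain atom.acquire.{sys,cta,gpu}[.global|.shared].cas.b32 with no fence; syncscope("") lowers to .sys, "block" to .cta, and "device" to .gpu.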
+define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+ ret i32 %new
+}
+
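+; acquire/acquire cmpxchg: same lowering as acquire/monotonic above, an atom.acquire.<scope>.cas.b32 with no fence.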
+define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+ ret i32 %new
+}
+
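+; acquire/seq_cst cmpxchg: the SM70 checks below expect fence.sc.<scope> followed by atom.acquire.<scope>.cas.b32.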
+define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+ ret i32 %new
+}
+
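+; release/monotonic cmpxchg: the SM70 checks below expect atom.release.<scope>.cas.b32 with no fence.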
+define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2];
+; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2];
+; SM70-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2];
+; SM70-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2];
+; SM70-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2];
+; SM70-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2];
+; SM70-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2];
+; SM70-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+ ret i32 %new
+}
+
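+; release/acquire cmpxchg: the SM70 checks below expect atom.acq_rel.<scope>.cas.b32 with no fence.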
+define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+ ret i32 %new
+}
+
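+; release/seq_cst cmpxchg: the SM70 checks below expect fence.sc.<scope> followed by atom.acquire.<scope>.cas.b32.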
+define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+ ret i32 %new
+}
+
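+; acq_rel/monotonic cmpxchg: the SM70 checks below expect atom.acq_rel.<scope>.cas.b32 with no fence.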
+define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+ ret i32 %new
+}
+
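+; acq_rel/acquire cmpxchg: the SM70 checks below expect atom.acq_rel.<scope>.cas.b32 with no fence.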
+define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+ ret i32 %new
+}
+
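+; acq_rel/seq_cst cmpxchg: the SM70 checks below expect fence.sc.<scope> followed by atom.acquire.<scope>.cas.b32.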
+define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2];
+; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2];
+; SM70-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2];
+; SM70-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2];
+; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2];
+; SM70-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2];
+; SM70-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2];
+; SM70-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_generic_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+ ret i64 %new
}
-define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i16_shared(
+define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_global_cta(
; SM70: {
-; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<20>;
-; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
-; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM70-NEXT: and.b32 %r11, %r10, 3;
-; SM70-NEXT: shl.b32 %r1, %r11, 3;
-; SM70-NEXT: mov.b32 %r12, 65535;
-; SM70-NEXT: shl.b32 %r13, %r12, %r1;
-; SM70-NEXT: not.b32 %r2, %r13;
-; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: shl.b32 %r4, %r9, %r1;
-; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM70-NEXT: and.b32 %r19, %r15, %r2;
-; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop
-; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r16, %r19, %r3;
-; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
-; SM70-NEXT: @%p1 bra $L__BB89_3;
-; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB89_1 Depth=1
-; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM70-NEXT: mov.u32 %r19, %r8;
-; SM70-NEXT: @%p2 bra $L__BB89_1;
-; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end
-; SM70-NEXT: fence.acq_rel.sys;
-; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+ ret i64 %new
}
-define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_monotonic_i32_generic(
+define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_global_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
-; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+ ret i64 %new
}
-define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_monotonic_i32_global(
+define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_shared_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
-; SM70-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+ ret i64 %new
}
-define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_monotonic_i32_shared(
+define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_shared_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
-; SM70-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+ ret i64 %new
}
-define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_acquire_i32_generic(
+define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_shared_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+ ret i64 %new
}
-define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_acquire_i32_global(
+define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_generic_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+ ret i64 %new
}
-define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_acquire_i32_shared(
+define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_generic_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+ ret i64 %new
}
-define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_seq_cst_i32_generic(
+define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_generic_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_shared_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_shared_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_generic_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+ ret i64 %new
}
-define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_seq_cst_i32_global(
+define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_generic_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_generic_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_global_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+ ret i64 %new
}
-define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: monotonic_seq_cst_i32_shared(
+define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_global_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_shared_sys(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+ ret i64 %new
}
-define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_monotonic_i32_generic(
+define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_shared_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+ ret i64 %new
}
-define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_monotonic_i32_global(
+define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_shared_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+ ret i64 %new
}
-define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_monotonic_i32_shared(
+define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_generic_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2];
+; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+ ret i64 %new
}
-define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_acquire_i32_generic(
+define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_generic_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2];
+; SM70-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+ ret i64 %new
}
-define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_acquire_i32_global(
+define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_generic_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+ ret i64 %new
}
-define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_acquire_i32_shared(
+define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_global_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2];
+; SM70-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2];
+; SM70-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_global_gpu(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2];
+; SM70-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+ ret i64 %new
}
-define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_seq_cst_i32_generic(
+define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_shared_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2];
+; SM70-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+ ret i64 %new
}
-define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_seq_cst_i32_global(
+define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_shared_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2];
+; SM70-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+ ret i64 %new
}
-define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acquire_seq_cst_i32_shared(
+define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_shared_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+ ret i64 %new
}
-define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_monotonic_i32_generic(
+define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_generic_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
-; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire
+ ret i64 %new
}
-define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_monotonic_i32_global(
+define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_generic_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
-; SM70-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+ ret i64 %new
}
-define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_monotonic_i32_shared(
+define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_generic_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
-; SM70-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+ ret i64 %new
}
-define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_acquire_i32_generic(
+define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_global_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+ ret i64 %new
}
-define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_acquire_i32_global(
+define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_global_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2];
-; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+ ret i64 %new
}
-define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_acquire_i32_shared(
+define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_global_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
-; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+ ret i64 %new
}
-define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_seq_cst_i32_generic(
+define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_shared_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+ ret i64 %new
}
-define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_seq_cst_i32_global(
+define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_shared_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+ ret i64 %new
}
-define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: release_seq_cst_i32_shared(
+define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_shared_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+ ret i64 %new
}
-define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_monotonic_i32_generic(
+define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_generic_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_monotonic_i32_global(
+define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_generic_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
-; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_monotonic_i32_shared(
+define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_generic_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
-; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_acquire_i32_generic(
+define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_global_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_acquire_i32_global(
+define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_global_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
-; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_acquire_i32_shared(
+define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_global_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
-; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
-; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i32_generic(
+define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_shared_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i32_global(
+define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_shared_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+ ret i64 %new
}
-define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i32_shared(
+define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_shared_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+ ret i64 %new
}
-define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_monotonic_i32_generic(
+define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_generic_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_monotonic_i32_global(
+define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_generic_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_monotonic_i32_shared(
+define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_generic_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_acquire_i32_generic(
+define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_global_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_acquire_i32_global(
+define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_global_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_acquire_i32_shared(
+define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_global_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i32_generic(
+define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_shared_sys(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i32_global(
+define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_shared_cta(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+ ret i64 %new
}
-define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i32_shared(
+define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_shared_gpu(
; SM70: {
-; SM70-NEXT: .reg .b32 %r<4>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
-; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+ ret i64 %new
}
-define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_monotonic_i64_generic(
+define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_generic_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
-; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_monotonic_i64_global(
+define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_generic_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
-; SM70-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_monotonic_i64_shared(
+define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
-; SM70-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_acquire_i64_generic(
+define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_global_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_acquire_i64_global(
+define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_acquire_i64_shared(
+define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_global_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_seq_cst_i64_generic(
+define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_shared_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_seq_cst_i64_global(
+define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_shared_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
ret i64 %new
}
-define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: monotonic_seq_cst_i64_shared(
+define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
ret i64 %new
}
-define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_monotonic_i64_generic(
+define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_generic_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_monotonic_i64_global(
+define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_generic_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_monotonic_i64_shared(
+define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_acquire_i64_generic(
+define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_global_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_acquire_i64_global(
+define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_acquire_i64_shared(
+define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_global_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_seq_cst_i64_generic(
+define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_shared_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_seq_cst_i64_global(
+define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_shared_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acquire_seq_cst_i64_shared(
+define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
ret i64 %new
}
-define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_monotonic_i64_generic(
+define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_generic_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
-; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
ret i64 %new
}
-define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_monotonic_i64_global(
+define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_generic_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
-; SM70-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
ret i64 %new
}
-define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_monotonic_i64_shared(
+define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
-; SM70-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
ret i64 %new
}
-define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_acquire_i64_generic(
+define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_global_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
ret i64 %new
}
-define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_acquire_i64_global(
+define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
-; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
ret i64 %new
}
-define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_acquire_i64_shared(
+define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_global_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
-; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
ret i64 %new
}
-define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_seq_cst_i64_generic(
+define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_shared_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
ret i64 %new
}
-define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_seq_cst_i64_global(
+define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_shared_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
ret i64 %new
}
-define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: release_seq_cst_i64_shared(
+define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
ret i64 %new
}
-define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_monotonic_i64_generic(
+define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_generic_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_monotonic_i64_global(
+define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_generic_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
-; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_monotonic_i64_shared(
+define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
-; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_acquire_i64_generic(
+define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_global_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_acquire_i64_global(
+define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
-; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_acquire_i64_shared(
+define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_global_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
-; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i64_generic(
+define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_shared_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i64_global(
+define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_shared_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: acq_rel_seq_cst_i64_shared(
+define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
ret i64 %new
}
-define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_monotonic_i64_generic(
+define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_generic_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_monotonic_i64_global(
+define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_generic_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_monotonic_i64_shared(
+define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_generic_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_acquire_i64_generic(
+define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_global_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_acquire_i64_global(
+define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_global_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_acquire_i64_shared(
+define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_global_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i64_generic(
+define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_shared_sys(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0];
; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2];
+; SM70-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i64_global(
+define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_shared_cta(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
-; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2];
+; SM70-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM70-LABEL: seq_cst_seq_cst_i64_shared(
+define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_shared_gpu(
; SM70: {
; SM70-NEXT: .reg .b64 %rd<5>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
-; SM70-NEXT: fence.sc.sys;
-; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
-; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
-; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0];
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2];
+; SM70-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
ret i64 %new
}
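; Illustrative sketch (the function and value names below are invented, not taken from the
; test files): the scoped-cmpxchg pattern exercised by the SM70 checks above. Per those
; checks, syncscope("block") maps to the .cta scope qualifier, syncscope("device") to .gpu,
; and the default scope to .sys; an sc fence at the matching scope is emitted when an
; ordering is seq_cst.
define i64 @example_acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
  ; Expected SM70 lowering (matching the acq_rel_acquire_i64_global_cta checks above):
  ;   atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
  %pair = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
  %old = extractvalue { i64, i1 } %pair, 0
  ret i64 %old
}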
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
index 5acb275a6f581..8eae5bfb0a133 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
-define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_monotonic_i8_generic(
+define i8 @monotonic_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_generic_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -23,7 +23,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -31,7 +31,7 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB0_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -43,12 +43,12 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_monotonic_i8_global(
+define i8 @monotonic_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_generic_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -68,15 +68,15 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cta_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB1_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -88,12 +88,12 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne
; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_monotonic_i8_shared(
+define i8 @monotonic_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_generic_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -113,15 +113,15 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_cluster_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB2_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -133,12 +133,12 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne
; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_acquire_i8_generic(
+define i8 @monotonic_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -158,7 +158,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_gpu_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -166,7 +166,7 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB3_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -176,15 +176,14 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB3_1;
; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_acquire_i8_global(
+define i8 @monotonic_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -192,8 +191,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -204,7 +203,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -212,7 +211,7 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB4_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -222,15 +221,14 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB4_1;
; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_acquire_i8_shared(
+define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -238,8 +236,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -250,15 +248,15 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cta_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB5_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -268,15 +266,14 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB5_1;
; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_seq_cst_i8_generic(
+define i8 @monotonic_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -284,9 +281,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -297,15 +293,15 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_cluster_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB6_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -315,15 +311,14 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB6_1;
; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_seq_cst_i8_global(
+define i8 @monotonic_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -331,9 +326,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -344,7 +338,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_gpu_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -352,7 +346,7 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB7_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -362,15 +356,14 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB7_1;
; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
ret i8 %new
}
-define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: monotonic_seq_cst_i8_shared(
+define i8 @monotonic_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_shared_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -378,9 +371,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -391,7 +383,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -399,7 +391,7 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB8_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -409,15 +401,14 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB8_1;
; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic monotonic
ret i8 %new
}
-define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_monotonic_i8_generic(
+define i8 @monotonic_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_shared_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -425,8 +416,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -437,15 +428,15 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cta_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB9_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -455,15 +446,14 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB9_1;
; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic monotonic
ret i8 %new
}
-define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_monotonic_i8_global(
+define i8 @monotonic_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_shared_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -471,8 +461,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -483,15 +473,15 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_cluster_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB10_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -501,15 +491,14 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB10_1;
; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic monotonic
ret i8 %new
}
-define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_monotonic_i8_shared(
+define i8 @monotonic_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_shared_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -517,8 +506,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -529,7 +518,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_gpu_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -537,7 +526,7 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB11_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -547,15 +536,14 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB11_1;
; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic monotonic
ret i8 %new
}
-define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_acquire_i8_generic(
+define i8 @monotonic_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_generic_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -563,8 +551,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -575,7 +563,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -583,7 +571,7 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB12_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -596,12 +584,12 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
ret i8 %new
}
-define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_acquire_i8_global(
+define i8 @monotonic_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_generic_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -609,8 +597,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -621,15 +609,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cta_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB13_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -639,15 +627,15 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB13_1;
; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
ret i8 %new
}
-define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_acquire_i8_shared(
+define i8 @monotonic_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_generic_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -655,8 +643,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -667,15 +655,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_cluster_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB14_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -685,15 +673,15 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB14_1;
; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cluster;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire
ret i8 %new
}
-define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_seq_cst_i8_generic(
+define i8 @monotonic_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -701,9 +689,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -714,7 +701,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_gpu_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -722,7 +709,7 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB15_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -732,15 +719,15 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB15_1;
; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.gpu;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
ret i8 %new
}
-define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_seq_cst_i8_global(
+define i8 @monotonic_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -748,9 +735,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -761,7 +747,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -769,7 +755,7 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB16_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -782,12 +768,12 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
ret i8 %new
}
-define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acquire_seq_cst_i8_shared(
+define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -795,9 +781,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -808,15 +793,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cta_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB17_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -826,15 +811,15 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB17_1;
; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
ret i8 %new
}
-define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_monotonic_i8_generic(
+define i8 @monotonic_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -842,9 +827,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -855,15 +839,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_cluster_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB18_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -873,14 +857,15 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB18_1;
; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire
ret i8 %new
}
-define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_monotonic_i8_global(
+define i8 @monotonic_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -888,9 +873,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -901,7 +885,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_gpu_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -909,7 +893,7 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB19_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -919,14 +903,15 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB19_1;
; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
ret i8 %new
}
-define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_monotonic_i8_shared(
+define i8 @monotonic_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_shared_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -934,9 +919,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -947,7 +931,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -955,7 +939,7 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB20_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -965,14 +949,15 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB20_1;
; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic acquire
ret i8 %new
}
-define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_acquire_i8_generic(
+define i8 @monotonic_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_shared_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -980,9 +965,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -993,15 +977,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cta_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB21_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1011,15 +995,15 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB21_1;
; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic acquire
ret i8 %new
}
-define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_acquire_i8_global(
+define i8 @monotonic_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_shared_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1027,9 +1011,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1040,15 +1023,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_cluster_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB22_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1058,15 +1041,15 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB22_1;
; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cluster;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic acquire
ret i8 %new
}
-define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_acquire_i8_shared(
+define i8 @monotonic_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_shared_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1074,9 +1057,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1087,7 +1069,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_gpu_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -1095,7 +1077,7 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB23_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1105,15 +1087,15 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB23_1;
; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.gpu;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic acquire
ret i8 %new
}
-define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_seq_cst_i8_generic(
+define i8 @monotonic_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_generic_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1121,8 +1103,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_sys_param_0];
; SM90-NEXT: fence.sc.sys;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
@@ -1134,7 +1116,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -1142,7 +1124,7 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB24_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1155,12 +1137,12 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
ret i8 %new
}
-define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_seq_cst_i8_global(
+define i8 @monotonic_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_generic_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1168,9 +1150,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1181,15 +1163,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cta_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB25_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1199,15 +1181,15 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB25_1;
; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
ret i8 %new
}
-define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: release_seq_cst_i8_shared(
+define i8 @monotonic_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_generic_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1215,9 +1197,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1228,15 +1210,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_cluster_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB26_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1246,15 +1228,15 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB26_1;
; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cluster;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst
ret i8 %new
}
-define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_monotonic_i8_generic(
+define i8 @monotonic_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1262,9 +1244,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1275,7 +1257,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_gpu_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -1283,7 +1265,7 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB27_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1293,15 +1275,15 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB27_1;
; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.gpu;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
ret i8 %new
}
-define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_monotonic_i8_global(
+define i8 @monotonic_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1309,9 +1291,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1322,7 +1304,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -1330,7 +1312,7 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB28_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1343,12 +1325,12 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
ret i8 %new
}
-define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_monotonic_i8_shared(
+define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1356,9 +1338,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1369,15 +1351,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cta_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB29_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1387,15 +1369,15 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB29_1;
; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
ret i8 %new
}
-define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_acquire_i8_generic(
+define i8 @monotonic_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1403,9 +1385,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1416,15 +1398,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_cluster_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB30_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1434,15 +1416,15 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB30_1;
; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cluster;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst
ret i8 %new
}
-define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_acquire_i8_global(
+define i8 @monotonic_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1450,9 +1432,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1463,7 +1445,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_gpu_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -1471,7 +1453,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB31_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1481,15 +1463,15 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB31_1;
; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.gpu;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
ret i8 %new
}
-define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_acquire_i8_shared(
+define i8 @monotonic_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_shared_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1497,9 +1479,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
-; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1510,7 +1492,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -1518,7 +1500,7 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB32_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1531,12 +1513,12 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") monotonic seq_cst
ret i8 %new
}
-define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i8_generic(
+define i8 @monotonic_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_shared_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1544,9 +1526,9 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1557,15 +1539,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cta_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB33_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1575,15 +1557,15 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB33_1;
; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") monotonic seq_cst
ret i8 %new
}
-define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i8_global(
+define i8 @monotonic_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_shared_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1591,9 +1573,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1604,15 +1586,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_cluster_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB34_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1622,15 +1604,15 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB34_1;
; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cluster;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") monotonic seq_cst
ret i8 %new
}
-define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i8_shared(
+define i8 @monotonic_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_shared_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1638,9 +1620,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1651,7 +1633,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_gpu_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -1659,7 +1641,7 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB35_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1669,15 +1651,15 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB35_1;
; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.gpu;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") monotonic seq_cst
ret i8 %new
}
-define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_monotonic_i8_generic(
+define i8 @acquire_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_generic_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1685,9 +1667,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1698,7 +1679,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -1706,7 +1687,7 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB36_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1719,12 +1700,12 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
ret i8 %new
}
-define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_monotonic_i8_global(
+define i8 @acquire_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_generic_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1732,9 +1713,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1745,15 +1725,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cta_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB37_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1763,15 +1743,15 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB37_1;
; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
ret i8 %new
}
-define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_monotonic_i8_shared(
+define i8 @acquire_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_generic_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1779,9 +1759,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1792,15 +1771,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_cluster_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB38_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1810,15 +1789,15 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new)
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB38_1;
; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cluster;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic
ret i8 %new
}
-define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_acquire_i8_generic(
+define i8 @acquire_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1826,9 +1805,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1839,7 +1817,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_gpu_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -1847,7 +1825,7 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB39_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1857,15 +1835,15 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB39_1;
; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.gpu;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
ret i8 %new
}
-define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_acquire_i8_global(
+define i8 @acquire_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1873,9 +1851,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1886,7 +1863,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -1894,7 +1871,7 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB40_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1907,12 +1884,12 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
ret i8 %new
}
-define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_acquire_i8_shared(
+define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1920,9 +1897,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1933,15 +1909,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cta_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB41_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1951,15 +1927,15 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB41_1;
; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cta;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
ret i8 %new
}
-define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i8_generic(
+define i8 @acquire_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -1967,9 +1943,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -1980,15 +1955,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_cluster_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
-; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB42_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1998,15 +1973,15 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB42_1;
; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.cluster;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic
ret i8 %new
}
-define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i8_global(
+define i8 @acquire_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -2014,9 +1989,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -2027,7 +2001,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_gpu_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.global.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -2035,7 +2009,7 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB43_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -2045,15 +2019,15 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB43_1;
; SM90-NEXT: $L__BB43_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: fence.acquire.gpu;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
ret i8 %new
}
-define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i8_shared(
+define i8 @acquire_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_shared_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
@@ -2061,9 +2035,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
; SM90-NEXT: and.b32 %r10, %r9, 3;
@@ -2074,7 +2047,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_sys_param_1];
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
; SM90-NEXT: and.b32 %r20, %r16, %r2;
@@ -2082,7 +2055,7 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
; SM90-NEXT: or.b32 %r17, %r20, %r3;
; SM90-NEXT: or.b32 %r18, %r20, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB44_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -2095,3586 +2068,20641 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
; SM90-NEXT: fence.acquire.sys;
; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire monotonic
ret i8 %new
}
-define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_monotonic_i16_generic(
+define i8 @acquire_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_shared_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB45_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB45_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB45_1;
; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire monotonic
+ ret i8 %new
}
-define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_monotonic_i16_global(
+define i8 @acquire_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_shared_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB46_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB46_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB46_1;
; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire monotonic
+ ret i8 %new
}
-define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_monotonic_i16_shared(
+define i8 @acquire_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_shared_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB47_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB47_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB47_1;
; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire monotonic
+ ret i8 %new
}
-define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_acquire_i16_generic(
+define i8 @acquire_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_generic_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB48_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB48_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB48_1;
; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end
; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
+ ret i8 %new
}
-define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_acquire_i16_global(
+define i8 @acquire_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_generic_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB49_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB49_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB49_1;
; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
+ ret i8 %new
}
-define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_acquire_i16_shared(
+define i8 @acquire_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_generic_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB50_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB50_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB50_1;
; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire
+ ret i8 %new
}
-define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_seq_cst_i16_generic(
+define i8 @acquire_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB51_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB51_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB51_1;
; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
+ ret i8 %new
}
-define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_seq_cst_i16_global(
+define i8 @acquire_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB52_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB52_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB52_1;
; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end
; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
+ ret i8 %new
}
-define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: monotonic_seq_cst_i16_shared(
+define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB53_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB53_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB53_1;
; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
+ ret i8 %new
}
-define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_monotonic_i16_generic(
+define i8 @acquire_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB54_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB54_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB54_1;
; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire
+ ret i8 %new
}
-define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_monotonic_i16_global(
+define i8 @acquire_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB55_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB55_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB55_1;
; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
+ ret i8 %new
}
-define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_monotonic_i16_shared(
+define i8 @acquire_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_shared_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_sys_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB56_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB56_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB56_1;
; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end
; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire acquire
+ ret i8 %new
}
-define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_acquire_i16_generic(
+define i8 @acquire_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_shared_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cta_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB57_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB57_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB57_1;
; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire acquire
+ ret i8 %new
}
-define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_acquire_i16_global(
+define i8 @acquire_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_shared_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_cluster_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB58_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB58_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB58_1;
; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire acquire
+ ret i8 %new
}
-define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_acquire_i16_shared(
+define i8 @acquire_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_shared_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_gpu_param_0];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB59_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB59_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB59_1;
; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire acquire
+ ret i8 %new
}
-define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_seq_cst_i16_generic(
+define i8 @acquire_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_generic_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_sys_param_0];
; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB60_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB60_1;
; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end
; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+ ret i8 %new
}
-define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_seq_cst_i16_global(
+define i8 @acquire_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_generic_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB61_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB61_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB61_1;
; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+ ret i8 %new
}
-define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acquire_seq_cst_i16_shared(
+define i8 @acquire_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_generic_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB62_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB62_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB62_1;
; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst
+ ret i8 %new
}
-define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_monotonic_i16_generic(
+define i8 @acquire_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB63_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB63_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB63_1;
; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+ ret i8 %new
}
-define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_monotonic_i16_global(
+define i8 @acquire_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB64_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB64_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB64_1;
; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+ ret i8 %new
}
-define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_monotonic_i16_shared(
+define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB65_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB65_1;
; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+ ret i8 %new
}
-define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_acquire_i16_generic(
+define i8 @acquire_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB66_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB66_1;
; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst
+ ret i8 %new
}
-define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_acquire_i16_global(
+define i8 @acquire_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB67_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB67_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB67_1;
; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+ ret i8 %new
}
-define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_acquire_i16_shared(
+define i8 @acquire_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_shared_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB68_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB68_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB68_1;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB68_1;
; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end
; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acquire seq_cst
+ ret i8 %new
}
-define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_seq_cst_i16_generic(
+define i8 @acquire_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_shared_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB69_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB69_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB69_1;
; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acquire seq_cst
+ ret i8 %new
}
-define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_seq_cst_i16_global(
+define i8 @acquire_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_shared_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB70_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB70_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB70_1;
; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acquire seq_cst
+ ret i8 %new
}
-define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: release_seq_cst_i16_shared(
+define i8 @acquire_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_shared_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB71_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB71_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB71_1;
; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acquire seq_cst
+ ret i8 %new
}
-define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_monotonic_i16_generic(
+define i8 @release_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_generic_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_sys_param_0];
; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB72_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB72_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB72_1;
; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+ ret i8 %new
}
-define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_monotonic_i16_global(
+define i8 @release_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_generic_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cta_param_0];
+; SM90-NEXT: fence.release.cta;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB73_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB73_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB73_1;
; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+ ret i8 %new
}
-define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_monotonic_i16_shared(
+define i8 @release_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_generic_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB74_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB74_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB74_1;
; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic
+ ret i8 %new
}
-define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_acquire_i16_generic(
+define i8 @release_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB75_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB75_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB75_1;
; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+ ret i8 %new
}
-define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_acquire_i16_global(
+define i8 @release_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_sys_param_0];
; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB76_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB76_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB76_1;
; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+ ret i8 %new
}
-define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_acquire_i16_shared(
+define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
-; SM90-NEXT: fence.release.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB77_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB77_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB77_1;
; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+ ret i8 %new
}
-define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i16_generic(
+define i8 @release_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB78_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB78_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB78_1;
; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic
+ ret i8 %new
}
-define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i16_global(
+define i8 @release_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB79_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB79_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB79_1;
; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+ ret i8 %new
}
-define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i16_shared(
+define i8 @release_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_shared_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_sys_param_0];
+; SM90-NEXT: fence.release.sys;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB80_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB80_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB80_1;
; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release monotonic
+ ret i8 %new
}
-define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_monotonic_i16_generic(
+define i8 @release_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_shared_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cta_param_0];
+; SM90-NEXT: fence.release.cta;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB81_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB81_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB81_1;
; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release monotonic
+ ret i8 %new
}
-define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_monotonic_i16_global(
+define i8 @release_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_shared_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB82_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB82_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB82_1;
; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release monotonic
+ ret i8 %new
}
-define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_monotonic_i16_shared(
+define i8 @release_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_shared_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB83_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB83_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB83_1;
; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release monotonic
+ ret i8 %new
}
-define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_acquire_i16_generic(
+define i8 @release_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_generic_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_sys_param_0];
+; SM90-NEXT: fence.release.sys;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB84_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB84_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB84_1;
; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end
; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release acquire
+ ret i8 %new
}
-define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_acquire_i16_global(
+define i8 @release_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_generic_cta(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cta_param_0];
+; SM90-NEXT: fence.release.cta;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB85_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB85_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB85_1;
; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+ ret i8 %new
}
-define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_acquire_i16_shared(
+define i8 @release_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_generic_cluster(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB86_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB86_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB86_1;
; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire
+ ret i8 %new
}
-define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i16_generic(
+define i8 @release_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB87_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB87_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB87_1;
; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+ ret i8 %new
}
-define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i16_global(
+define i8 @release_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global_sys(
; SM90: {
; SM90-NEXT: .reg .pred %p<3>;
; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b32 %r<21>;
; SM90-NEXT: .reg .b64 %rd<3>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_sys_param_0];
+; SM90-NEXT: fence.release.sys;
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.global.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM90-NEXT: @%p1 bra $L__BB88_3;
; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM90-NEXT: // in Loop: Header=BB88_1 Depth=1
; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
; SM90-NEXT: @%p2 bra $L__BB88_1;
; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end
; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB89_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB89_1;
+; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB90_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB90_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB90_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB90_1;
+; SM90-NEXT: $L__BB90_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB91_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB91_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB91_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB91_1;
+; SM90-NEXT: $L__BB91_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB92_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB92_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB92_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB92_1;
+; SM90-NEXT: $L__BB92_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB93_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB93_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB93_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB93_1;
+; SM90-NEXT: $L__BB93_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB94_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB94_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB94_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB94_1;
+; SM90-NEXT: $L__BB94_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB95_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB95_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB95_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB95_1;
+; SM90-NEXT: $L__BB95_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release acquire
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB96_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB96_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB96_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB96_1;
+; SM90-NEXT: $L__BB96_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB97_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB97_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB97_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB97_1;
+; SM90-NEXT: $L__BB97_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB98_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB98_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB98_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB98_1;
+; SM90-NEXT: $L__BB98_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB99_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB99_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB99_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB99_1;
+; SM90-NEXT: $L__BB99_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB100_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB100_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB100_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB100_1;
+; SM90-NEXT: $L__BB100_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB101_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB101_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB101_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB101_1;
+; SM90-NEXT: $L__BB101_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB102_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB102_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB102_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB102_1;
+; SM90-NEXT: $L__BB102_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB103_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB103_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB103_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB103_1;
+; SM90-NEXT: $L__BB103_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB104_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB104_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB104_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB104_1;
+; SM90-NEXT: $L__BB104_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB105_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB105_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB105_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB105_1;
+; SM90-NEXT: $L__BB105_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB106_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB106_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB106_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB106_1;
+; SM90-NEXT: $L__BB106_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB107_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB107_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB107_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB107_1;
+; SM90-NEXT: $L__BB107_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") release seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB108_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB108_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB108_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB108_1;
+; SM90-NEXT: $L__BB108_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB109_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB109_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB109_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB109_1;
+; SM90-NEXT: $L__BB109_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB110_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB110_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB110_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB110_1;
+; SM90-NEXT: $L__BB110_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB111_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB111_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB111_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB111_1;
+; SM90-NEXT: $L__BB111_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB112_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB112_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB112_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB112_1;
+; SM90-NEXT: $L__BB112_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB113_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB113_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB113_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB113_1;
+; SM90-NEXT: $L__BB113_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB114_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB114_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB114_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB114_1;
+; SM90-NEXT: $L__BB114_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB115_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB115_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB115_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB115_1;
+; SM90-NEXT: $L__BB115_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB116_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB116_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB116_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB116_1;
+; SM90-NEXT: $L__BB116_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB117_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB117_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB117_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB117_1;
+; SM90-NEXT: $L__BB117_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB118_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB118_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB118_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB118_1;
+; SM90-NEXT: $L__BB118_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB119_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB119_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB119_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB119_1;
+; SM90-NEXT: $L__BB119_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB120_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB120_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB120_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB120_1;
+; SM90-NEXT: $L__BB120_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB121_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB121_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB121_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB121_1;
+; SM90-NEXT: $L__BB121_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB122_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB122_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB122_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB122_1;
+; SM90-NEXT: $L__BB122_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB123_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB123_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB123_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB123_1;
+; SM90-NEXT: $L__BB123_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB124_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB124_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB124_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB124_1;
+; SM90-NEXT: $L__BB124_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB125_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB125_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB125_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB125_1;
+; SM90-NEXT: $L__BB125_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB126_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB126_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB126_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB126_1;
+; SM90-NEXT: $L__BB126_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB127_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB127_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB127_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB127_1;
+; SM90-NEXT: $L__BB127_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB128_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB128_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB128_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB128_1;
+; SM90-NEXT: $L__BB128_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB129_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB129_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB129_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB129_1;
+; SM90-NEXT: $L__BB129_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB130_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB130_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB130_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB130_1;
+; SM90-NEXT: $L__BB130_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB131_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB131_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB131_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB131_1;
+; SM90-NEXT: $L__BB131_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB132_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB132_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB132_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB132_1;
+; SM90-NEXT: $L__BB132_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB133_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB133_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB133_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB133_1;
+; SM90-NEXT: $L__BB133_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB134_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB134_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB134_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB134_1;
+; SM90-NEXT: $L__BB134_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB135_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB135_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB135_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB135_1;
+; SM90-NEXT: $L__BB135_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB136_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB136_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB136_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB136_1;
+; SM90-NEXT: $L__BB136_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB137_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB137_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB137_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB137_1;
+; SM90-NEXT: $L__BB137_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB138_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB138_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB138_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB138_1;
+; SM90-NEXT: $L__BB138_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB139_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB139_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB139_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB139_1;
+; SM90-NEXT: $L__BB139_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB140_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB140_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB140_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB140_1;
+; SM90-NEXT: $L__BB140_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB141_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB141_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB141_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB141_1;
+; SM90-NEXT: $L__BB141_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB142_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB142_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB142_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB142_1;
+; SM90-NEXT: $L__BB142_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB143_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB143_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB143_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB143_1;
+; SM90-NEXT: $L__BB143_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB144_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB144_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB144_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB144_1;
+; SM90-NEXT: $L__BB144_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB145_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB145_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB145_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB145_1;
+; SM90-NEXT: $L__BB145_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB146_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB146_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB146_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB146_1;
+; SM90-NEXT: $L__BB146_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB147_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB147_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB147_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB147_1;
+; SM90-NEXT: $L__BB147_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB148_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB148_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB148_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB148_1;
+; SM90-NEXT: $L__BB148_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB149_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB149_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB149_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB149_1;
+; SM90-NEXT: $L__BB149_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB150_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB150_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB150_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB150_1;
+; SM90-NEXT: $L__BB150_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB151_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB151_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB151_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB151_1;
+; SM90-NEXT: $L__BB151_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB152_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB152_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB152_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB152_1;
+; SM90-NEXT: $L__BB152_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB153_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB153_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB153_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB153_1;
+; SM90-NEXT: $L__BB153_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB154_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB154_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB154_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB154_1;
+; SM90-NEXT: $L__BB154_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB155_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB155_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB155_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB155_1;
+; SM90-NEXT: $L__BB155_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB156_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB156_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB156_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB156_1;
+; SM90-NEXT: $L__BB156_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB157_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB157_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB157_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB157_1;
+; SM90-NEXT: $L__BB157_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB158_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB158_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB158_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB158_1;
+; SM90-NEXT: $L__BB158_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB159_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB159_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB159_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB159_1;
+; SM90-NEXT: $L__BB159_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB160_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB160_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB160_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB160_1;
+; SM90-NEXT: $L__BB160_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB161_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB161_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB161_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB161_1;
+; SM90-NEXT: $L__BB161_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB162_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB162_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB162_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB162_1;
+; SM90-NEXT: $L__BB162_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB163_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB163_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB163_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB163_1;
+; SM90-NEXT: $L__BB163_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB164_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB164_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB164_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB164_1;
+; SM90-NEXT: $L__BB164_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB165_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB165_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB165_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB165_1;
+; SM90-NEXT: $L__BB165_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB166_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB166_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB166_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB166_1;
+; SM90-NEXT: $L__BB166_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB167_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB167_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB167_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB167_1;
+; SM90-NEXT: $L__BB167_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_sys(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB168_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB168_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB168_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB168_1;
+; SM90-NEXT: $L__BB168_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB169_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB169_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB169_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB169_1;
+; SM90-NEXT: $L__BB169_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_cluster(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB170_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB170_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB170_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB170_1;
+; SM90-NEXT: $L__BB170_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic_gpu(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB171_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB171_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB171_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB171_1;
+; SM90-NEXT: $L__BB171_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_sys(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB172_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB172_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB172_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB172_1;
+; SM90-NEXT: $L__BB172_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB173_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB173_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB173_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB173_1;
+; SM90-NEXT: $L__BB173_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_cluster(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB174_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB174_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB174_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB174_1;
+; SM90-NEXT: $L__BB174_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global_gpu(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB175_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB175_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB175_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB175_1;
+; SM90-NEXT: $L__BB175_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_sys(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_sys_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB176_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB176_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB176_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB176_1;
+; SM90-NEXT: $L__BB176_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cta_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB177_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB177_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB177_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB177_1;
+; SM90-NEXT: $L__BB177_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("block") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_cluster(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_cluster_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB178_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB178_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB178_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB178_1;
+; SM90-NEXT: $L__BB178_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("cluster") seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared_gpu(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_gpu_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB179_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB179_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB179_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB179_1;
+; SM90-NEXT: $L__BB179_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new syncscope("device") seq_cst seq_cst
+ ret i8 %new
+}
+
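+; Note: the i16 tests below exercise the same partword cmpxchg emulation loop as
+; the i8 tests above, using a 16-bit mask (65535) in place of the 8-bit mask (255).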
+define i16 @monotonic_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB180_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB180_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB180_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB180_1;
+; SM90-NEXT: $L__BB180_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB181_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB181_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB181_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB181_1;
+; SM90-NEXT: $L__BB181_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB182_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB182_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB182_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB182_1;
+; SM90-NEXT: $L__BB182_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB183_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB183_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB183_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB183_1;
+; SM90-NEXT: $L__BB183_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB184_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB184_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB184_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB184_1;
+; SM90-NEXT: $L__BB184_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB185_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB185_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB185_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB185_1;
+; SM90-NEXT: $L__BB185_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB186_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB186_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB186_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB186_1;
+; SM90-NEXT: $L__BB186_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB187_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB187_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB187_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB187_1;
+; SM90-NEXT: $L__BB187_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB188_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB188_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB188_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB188_1;
+; SM90-NEXT: $L__BB188_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB189_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB189_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB189_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB189_1;
+; SM90-NEXT: $L__BB189_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB190_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB190_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB190_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB190_1;
+; SM90-NEXT: $L__BB190_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB191_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB191_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB191_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB191_1;
+; SM90-NEXT: $L__BB191_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic monotonic
+ ret i16 %new
+}
+
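+; Note: for the monotonic/acquire ordering pairs below, the emulation loop still
+; uses a relaxed CAS and is followed by a fence.acquire at the requested scope.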
+define i16 @monotonic_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB192_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB192_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB192_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB192_1;
+; SM90-NEXT: $L__BB192_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB193_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB193_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB193_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB193_1;
+; SM90-NEXT: $L__BB193_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB194_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB194_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB194_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB194_1;
+; SM90-NEXT: $L__BB194_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB195_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB195_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB195_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB195_1;
+; SM90-NEXT: $L__BB195_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB196_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB196_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB196_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB196_1;
+; SM90-NEXT: $L__BB196_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB197_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB197_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB197_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB197_1;
+; SM90-NEXT: $L__BB197_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB198_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB198_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB198_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB198_1;
+; SM90-NEXT: $L__BB198_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB199_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB199_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB199_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB199_1;
+; SM90-NEXT: $L__BB199_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB200_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB200_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB200_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB200_1;
+; SM90-NEXT: $L__BB200_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB201_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB201_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB201_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB201_1;
+; SM90-NEXT: $L__BB201_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB202_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB202_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB202_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB202_1;
+; SM90-NEXT: $L__BB202_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB203_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB203_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB203_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB203_1;
+; SM90-NEXT: $L__BB203_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB204_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB204_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB204_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB204_1;
+; SM90-NEXT: $L__BB204_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB205_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB205_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB205_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB205_1;
+; SM90-NEXT: $L__BB205_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB206_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB206_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB206_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB206_1;
+; SM90-NEXT: $L__BB206_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB207_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB207_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB207_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB207_1;
+; SM90-NEXT: $L__BB207_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB208_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB208_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB208_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB208_1;
+; SM90-NEXT: $L__BB208_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB209_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB209_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB209_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB209_1;
+; SM90-NEXT: $L__BB209_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB210_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB210_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB210_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB210_1;
+; SM90-NEXT: $L__BB210_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB211_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB211_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB211_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB211_1;
+; SM90-NEXT: $L__BB211_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB212_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB212_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB212_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB212_1;
+; SM90-NEXT: $L__BB212_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB213_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB213_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB213_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB213_1;
+; SM90-NEXT: $L__BB213_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB214_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB214_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB214_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB214_1;
+; SM90-NEXT: $L__BB214_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB215_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB215_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB215_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB215_1;
+; SM90-NEXT: $L__BB215_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB216_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB216_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB216_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB216_1;
+; SM90-NEXT: $L__BB216_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB217_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB217_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB217_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB217_1;
+; SM90-NEXT: $L__BB217_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB218_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB218_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB218_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB218_1;
+; SM90-NEXT: $L__BB218_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB219_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB219_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB219_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB219_1;
+; SM90-NEXT: $L__BB219_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB220_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB220_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB220_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB220_1;
+; SM90-NEXT: $L__BB220_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB221_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB221_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB221_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB221_1;
+; SM90-NEXT: $L__BB221_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB222_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB222_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB222_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB222_1;
+; SM90-NEXT: $L__BB222_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB223_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB223_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB223_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB223_1;
+; SM90-NEXT: $L__BB223_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB224_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB224_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB224_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB224_1;
+; SM90-NEXT: $L__BB224_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB225_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB225_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB225_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB225_1;
+; SM90-NEXT: $L__BB225_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB226_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB226_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB226_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB226_1;
+; SM90-NEXT: $L__BB226_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB227_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB227_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB227_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB227_1;
+; SM90-NEXT: $L__BB227_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB228_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB228_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB228_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB228_1;
+; SM90-NEXT: $L__BB228_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB229_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB229_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB229_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB229_1;
+; SM90-NEXT: $L__BB229_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB230_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB230_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB230_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB230_1;
+; SM90-NEXT: $L__BB230_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB231_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB231_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB231_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB231_1;
+; SM90-NEXT: $L__BB231_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB232_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB232_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB232_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB232_1;
+; SM90-NEXT: $L__BB232_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB233_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB233_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB233_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB233_1;
+; SM90-NEXT: $L__BB233_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB234_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB234_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB234_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB234_1;
+; SM90-NEXT: $L__BB234_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB235_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB235_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB235_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB235_1;
+; SM90-NEXT: $L__BB235_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_sys_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_sys_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB236_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB236_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB236_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB236_1;
+; SM90-NEXT: $L__BB236_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB237_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB237_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB237_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB237_1;
+; SM90-NEXT: $L__BB237_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_cluster_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_cluster_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB238_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB238_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB238_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB238_1;
+; SM90-NEXT: $L__BB238_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_gpu_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_gpu_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB239_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB239_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB239_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB239_1;
+; SM90-NEXT: $L__BB239_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB240_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB240_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB240_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB240_1;
+; SM90-NEXT: $L__BB240_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB241_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB241_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB241_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB241_1;
+; SM90-NEXT: $L__BB241_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB242_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB242_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB242_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB242_1;
+; SM90-NEXT: $L__BB242_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB243_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB243_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB243_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB243_1;
+; SM90-NEXT: $L__BB243_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB244_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB244_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB244_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB244_1;
+; SM90-NEXT: $L__BB244_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB245_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB245_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB245_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB245_1;
+; SM90-NEXT: $L__BB245_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB246_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB246_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB246_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB246_1;
+; SM90-NEXT: $L__BB246_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB247_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB247_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB247_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB247_1;
+; SM90-NEXT: $L__BB247_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB248_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB248_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB248_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB248_1;
+; SM90-NEXT: $L__BB248_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB249_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB249_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB249_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB249_1;
+; SM90-NEXT: $L__BB249_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB250_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB250_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB250_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB250_1;
+; SM90-NEXT: $L__BB250_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB251_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB251_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB251_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB251_1;
+; SM90-NEXT: $L__BB251_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB252_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB252_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB252_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB252_1;
+; SM90-NEXT: $L__BB252_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB253_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB253_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB253_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB253_1;
+; SM90-NEXT: $L__BB253_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB254_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB254_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB254_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB254_1;
+; SM90-NEXT: $L__BB254_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB255_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB255_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB255_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB255_1;
+; SM90-NEXT: $L__BB255_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB256_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB256_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB256_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB256_1;
+; SM90-NEXT: $L__BB256_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB257_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB257_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB257_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB257_1;
+; SM90-NEXT: $L__BB257_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB258_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB258_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB258_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB258_1;
+; SM90-NEXT: $L__BB258_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB259_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB259_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB259_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB259_1;
+; SM90-NEXT: $L__BB259_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB260_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB260_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB260_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB260_1;
+; SM90-NEXT: $L__BB260_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB261_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB261_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB261_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB261_1;
+; SM90-NEXT: $L__BB261_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB262_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB262_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB262_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB262_1;
+; SM90-NEXT: $L__BB262_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB263_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB263_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB263_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB263_1;
+; SM90-NEXT: $L__BB263_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release monotonic
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB264_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB264_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB264_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB264_1;
+; SM90-NEXT: $L__BB264_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB265_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB265_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB265_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB265_1;
+; SM90-NEXT: $L__BB265_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB266_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB266_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB266_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB266_1;
+; SM90-NEXT: $L__BB266_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB267_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB267_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB267_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB267_1;
+; SM90-NEXT: $L__BB267_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB268_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB268_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB268_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB268_1;
+; SM90-NEXT: $L__BB268_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB269_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB269_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB269_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB269_1;
+; SM90-NEXT: $L__BB269_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB270_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB270_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB270_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB270_1;
+; SM90-NEXT: $L__BB270_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB271_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB271_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB271_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB271_1;
+; SM90-NEXT: $L__BB271_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB272_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB272_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB272_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB272_1;
+; SM90-NEXT: $L__BB272_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB273_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB273_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB273_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB273_1;
+; SM90-NEXT: $L__BB273_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB274_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB274_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB274_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB274_1;
+; SM90-NEXT: $L__BB274_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB275_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB275_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB275_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB275_1;
+; SM90-NEXT: $L__BB275_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release acquire
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB276_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB276_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB276_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB276_1;
+; SM90-NEXT: $L__BB276_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB277_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB277_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB277_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB277_1;
+; SM90-NEXT: $L__BB277_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB278_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB278_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB278_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB278_1;
+; SM90-NEXT: $L__BB278_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB279_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB279_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB279_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB279_1;
+; SM90-NEXT: $L__BB279_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB280_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB280_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB280_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB280_1;
+; SM90-NEXT: $L__BB280_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB281_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB281_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB281_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB281_1;
+; SM90-NEXT: $L__BB281_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB282_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB282_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB282_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB282_1;
+; SM90-NEXT: $L__BB282_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB283_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB283_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB283_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB283_1;
+; SM90-NEXT: $L__BB283_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB284_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB284_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB284_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB284_1;
+; SM90-NEXT: $L__BB284_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB285_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB285_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB285_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB285_1;
+; SM90-NEXT: $L__BB285_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB286_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB286_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB286_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB286_1;
+; SM90-NEXT: $L__BB286_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB287_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB287_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB287_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB287_1;
+; SM90-NEXT: $L__BB287_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") release seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB288_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB288_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB288_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB288_1;
+; SM90-NEXT: $L__BB288_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB289_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB289_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB289_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB289_1;
+; SM90-NEXT: $L__BB289_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB290_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB290_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB290_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB290_1;
+; SM90-NEXT: $L__BB290_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB291_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB291_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB291_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB291_1;
+; SM90-NEXT: $L__BB291_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB292_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB292_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB292_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB292_1;
+; SM90-NEXT: $L__BB292_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB293_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB293_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB293_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB293_1;
+; SM90-NEXT: $L__BB293_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB294_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB294_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB294_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB294_1;
+; SM90-NEXT: $L__BB294_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB295_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB295_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB295_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB295_1;
+; SM90-NEXT: $L__BB295_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB296_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB296_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB296_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB296_1;
+; SM90-NEXT: $L__BB296_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB297_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB297_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB297_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB297_1;
+; SM90-NEXT: $L__BB297_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB298_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB298_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB298_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB298_1;
+; SM90-NEXT: $L__BB298_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB299_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB299_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB299_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB299_1;
+; SM90-NEXT: $L__BB299_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB300_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB300_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB300_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB300_1;
+; SM90-NEXT: $L__BB300_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB301_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB301_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB301_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB301_1;
+; SM90-NEXT: $L__BB301_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB302_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB302_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB302_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB302_1;
+; SM90-NEXT: $L__BB302_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB303_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB303_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB303_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB303_1;
+; SM90-NEXT: $L__BB303_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB304_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB304_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB304_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB304_1;
+; SM90-NEXT: $L__BB304_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB305_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB305_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB305_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB305_1;
+; SM90-NEXT: $L__BB305_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB306_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB306_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB306_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB306_1;
+; SM90-NEXT: $L__BB306_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB307_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB307_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB307_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB307_1;
+; SM90-NEXT: $L__BB307_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_sys_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB308_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB308_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB308_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB308_1;
+; SM90-NEXT: $L__BB308_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB309_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB309_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB309_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB309_1;
+; SM90-NEXT: $L__BB309_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_cluster_param_0];
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB310_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB310_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB310_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB310_1;
+; SM90-NEXT: $L__BB310_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_gpu_param_0];
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB311_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB311_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB311_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB311_1;
+; SM90-NEXT: $L__BB311_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB312_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB312_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB312_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB312_1;
+; SM90-NEXT: $L__BB312_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB313_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB313_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB313_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB313_1;
+; SM90-NEXT: $L__BB313_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB314_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB314_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB314_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB314_1;
+; SM90-NEXT: $L__BB314_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB315_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB315_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB315_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB315_1;
+; SM90-NEXT: $L__BB315_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB316_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB316_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB316_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB316_1;
+; SM90-NEXT: $L__BB316_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB317_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB317_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB317_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB317_1;
+; SM90-NEXT: $L__BB317_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB318_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB318_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB318_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB318_1;
+; SM90-NEXT: $L__BB318_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB319_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB319_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB319_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB319_1;
+; SM90-NEXT: $L__BB319_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB320_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB320_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB320_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB320_1;
+; SM90-NEXT: $L__BB320_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB321_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB321_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB321_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB321_1;
+; SM90-NEXT: $L__BB321_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB322_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB322_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB322_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB322_1;
+; SM90-NEXT: $L__BB322_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB323_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB323_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB323_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB323_1;
+; SM90-NEXT: $L__BB323_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB324_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB324_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB324_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB324_1;
+; SM90-NEXT: $L__BB324_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB325_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB325_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB325_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB325_1;
+; SM90-NEXT: $L__BB325_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB326_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB326_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB326_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB326_1;
+; SM90-NEXT: $L__BB326_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB327_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB327_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB327_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB327_1;
+; SM90-NEXT: $L__BB327_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB328_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB328_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB328_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB328_1;
+; SM90-NEXT: $L__BB328_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB329_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB329_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB329_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB329_1;
+; SM90-NEXT: $L__BB329_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB330_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB330_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB330_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB330_1;
+; SM90-NEXT: $L__BB330_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB331_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB331_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB331_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB331_1;
+; SM90-NEXT: $L__BB331_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB332_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB332_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB332_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB332_1;
+; SM90-NEXT: $L__BB332_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB333_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB333_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB333_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB333_1;
+; SM90-NEXT: $L__BB333_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB334_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB334_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB334_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB334_1;
+; SM90-NEXT: $L__BB334_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB335_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB335_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB335_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB335_1;
+; SM90-NEXT: $L__BB335_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB336_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB336_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB336_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB336_1;
+; SM90-NEXT: $L__BB336_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB337_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB337_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB337_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB337_1;
+; SM90-NEXT: $L__BB337_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB338_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB338_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB338_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB338_1;
+; SM90-NEXT: $L__BB338_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB339_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB339_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB339_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB339_1;
+; SM90-NEXT: $L__BB339_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB340_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB340_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB340_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB340_1;
+; SM90-NEXT: $L__BB340_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB341_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB341_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB341_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB341_1;
+; SM90-NEXT: $L__BB341_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB342_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB342_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB342_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB342_1;
+; SM90-NEXT: $L__BB342_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB343_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB343_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB343_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB343_1;
+; SM90-NEXT: $L__BB343_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB344_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB344_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB344_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB344_1;
+; SM90-NEXT: $L__BB344_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB345_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB345_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB345_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB345_1;
+; SM90-NEXT: $L__BB345_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB346_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB346_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB346_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB346_1;
+; SM90-NEXT: $L__BB346_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB347_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB347_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB347_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB347_1;
+; SM90-NEXT: $L__BB347_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_sys(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB348_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB348_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB348_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB348_1;
+; SM90-NEXT: $L__BB348_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_cta(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB349_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB349_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB349_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB349_1;
+; SM90-NEXT: $L__BB349_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_cluster(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB350_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB350_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB350_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB350_1;
+; SM90-NEXT: $L__BB350_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic_gpu(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB351_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB351_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB351_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB351_1;
+; SM90-NEXT: $L__BB351_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_sys(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB352_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB352_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB352_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB352_1;
+; SM90-NEXT: $L__BB352_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_cta(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB353_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB353_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB353_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB353_1;
+; SM90-NEXT: $L__BB353_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_cluster(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB354_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB354_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB354_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB354_1;
+; SM90-NEXT: $L__BB354_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global_gpu(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB355_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB355_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB355_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB355_1;
+; SM90-NEXT: $L__BB355_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_sys(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_sys_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_sys_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB356_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB356_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB356_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB356_1;
+; SM90-NEXT: $L__BB356_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_cta(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cta_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB357_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB357_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB357_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB357_1;
+; SM90-NEXT: $L__BB357_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("block") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_cluster(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_cluster_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_cluster_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB358_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB358_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB358_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB358_1;
+; SM90-NEXT: $L__BB358_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("cluster") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared_gpu(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_gpu_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_gpu_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB359_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB359_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB359_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB359_1;
+; SM90-NEXT: $L__BB359_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new syncscope("device") seq_cst seq_cst
+ ret i16 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_sys_param_2];
+; SM90-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cta_param_2];
+; SM90-NEXT: atom.relaxed.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.relaxed.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.relaxed.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_sys_param_2];
+; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cta_param_2];
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_cluster_param_2];
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_gpu_param_2];
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_sys_param_2];
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cta_param_2];
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_sys_param_2];
+; SM90-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cta_param_2];
+; SM90-NEXT: atom.release.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.release.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.release.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_sys_param_2];
+; SM90-NEXT: atom.release.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cta_param_2];
+; SM90-NEXT: atom.release.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_cluster_param_2];
+; SM90-NEXT: atom.release.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_gpu_param_2];
+; SM90-NEXT: atom.release.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_sys_param_2];
+; SM90-NEXT: atom.release.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cta_param_2];
+; SM90-NEXT: atom.release.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.release.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.release.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release monotonic
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release acquire
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") release seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_sys_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cta_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") acq_rel seq_cst
+ ret i32 %new
+}
+
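+; As the CHECK lines in the following tests show, any cmpxchg with a seq_cst
+; ordering (success or failure) is bracketed by a scoped sequentially-consistent
+; fence: fence.sc.<scope> is emitted before the scoped atom.acquire.<scope>.cas.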
+define i32 @seq_cst_monotonic_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_sys(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_cta(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_cluster(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic_gpu(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_sys(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_cluster(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global_gpu(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_sys(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_sys_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_cta(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cta_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("block") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_cluster(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("cluster") seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared_gpu(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new syncscope("device") seq_cst seq_cst
+ ret i32 %new
+}
+
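+; The i64 variants below follow the same scheme as the i32 tests, operating on
+; .b64 values; a fully relaxed (monotonic/monotonic) cmpxchg maps to
+; atom.relaxed.<scope>.cas with no surrounding fence.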
+define i64 @monotonic_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_sys_param_2];
+; SM90-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cta_param_2];
+; SM90-NEXT: atom.relaxed.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.relaxed.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.relaxed.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_sys_param_2];
+; SM90-NEXT: atom.relaxed.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cta_param_2];
+; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_cluster_param_2];
+; SM90-NEXT: atom.relaxed.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_gpu_param_2];
+; SM90-NEXT: atom.relaxed.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_sys_param_2];
+; SM90-NEXT: atom.relaxed.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cta_param_2];
+; SM90-NEXT: atom.relaxed.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.relaxed.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.relaxed.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") monotonic seq_cst
+ ret i64 %new
+}
+
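+; An acquire success ordering with a monotonic failure ordering needs no leading
+; fence; the acquire semantics are carried by atom.acquire.<scope>.cas itself,
+; as checked below.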
+define i64 @acquire_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_sys_param_2];
+; SM90-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_generic_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cta_param_2];
+; SM90-NEXT: atom.release.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_generic_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.release.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.release.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_sys_param_2];
+; SM90-NEXT: atom.release.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+ ret i64 %new
}
-define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i16_shared(
+define i64 @release_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_global_cta(
; SM90: {
-; SM90-NEXT: .reg .pred %p<3>;
-; SM90-NEXT: .reg .b16 %rs<2>;
-; SM90-NEXT: .reg .b32 %r<20>;
-; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
-; SM90-NEXT: and.b64 %rd1, %rd2, -4;
-; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
-; SM90-NEXT: and.b32 %r11, %r10, 3;
-; SM90-NEXT: shl.b32 %r1, %r11, 3;
-; SM90-NEXT: mov.b32 %r12, 65535;
-; SM90-NEXT: shl.b32 %r13, %r12, %r1;
-; SM90-NEXT: not.b32 %r2, %r13;
-; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
-; SM90-NEXT: shl.b32 %r3, %r14, %r1;
-; SM90-NEXT: shl.b32 %r4, %r9, %r1;
-; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
-; SM90-NEXT: and.b32 %r19, %r15, %r2;
-; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop
-; SM90-NEXT: // =>This Inner Loop Header: Depth=1
-; SM90-NEXT: or.b32 %r16, %r19, %r3;
-; SM90-NEXT: or.b32 %r17, %r19, %r4;
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
-; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
-; SM90-NEXT: @%p1 bra $L__BB89_3;
-; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1
-; SM90-NEXT: and.b32 %r8, %r7, %r2;
-; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
-; SM90-NEXT: mov.u32 %r19, %r8;
-; SM90-NEXT: @%p2 bra $L__BB89_1;
-; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end
-; SM90-NEXT: fence.acquire.sys;
-; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cta_param_2];
+; SM90-NEXT: atom.release.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
- ret i16 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+ ret i64 %new
}
-define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_monotonic_i32_generic(
+define i64 @release_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_global_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
-; SM90-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_cluster_param_2];
+; SM90-NEXT: atom.release.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic
+ ret i64 %new
}
-define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_monotonic_i32_global(
+define i64 @release_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_global_gpu(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
-; SM90-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_gpu_param_2];
+; SM90-NEXT: atom.release.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+ ret i64 %new
}
-define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_monotonic_i32_shared(
+define i64 @release_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_shared_sys(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
-; SM90-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_sys_param_2];
+; SM90-NEXT: atom.release.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release monotonic
+ ret i64 %new
}
-define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_acquire_i32_generic(
+define i64 @release_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_shared_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cta_param_2];
+; SM90-NEXT: atom.release.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release monotonic
+ ret i64 %new
}
-define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_acquire_i32_global(
+define i64 @release_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_shared_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.release.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release monotonic
+ ret i64 %new
}
-define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_acquire_i32_shared(
+define i64 @release_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_shared_gpu(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.release.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release monotonic
+ ret i64 %new
}
-define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_seq_cst_i32_generic(
+define i64 @release_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_generic_sys(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release acquire
+ ret i64 %new
}
-define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_seq_cst_i32_global(
+define i64 @release_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_generic_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+ ret i64 %new
}
-define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: monotonic_seq_cst_i32_shared(
+define i64 @release_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_generic_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_generic_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_global_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_global_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_shared_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_shared_cluster(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_shared_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release acquire
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_generic_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_sys_param_0];
; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+ ret i64 %new
}
-define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_monotonic_i32_generic(
+define i64 @release_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_generic_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+ ret i64 %new
}
-define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_monotonic_i32_global(
+define i64 @release_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_generic_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst
+ ret i64 %new
}
-define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_monotonic_i32_shared(
+define i64 @release_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_generic_gpu(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+ ret i64 %new
}
-define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_acquire_i32_generic(
+define i64 @release_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_global_sys(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+ ret i64 %new
}
-define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_acquire_i32_global(
+define i64 @release_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_global_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+ ret i64 %new
}
-define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_acquire_i32_shared(
+define i64 @release_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_global_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_global_gpu(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_shared_sys(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") release seq_cst
+ ret i64 %new
}
-define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_seq_cst_i32_generic(
+define i64 @release_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_shared_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") release seq_cst
+ ret i64 %new
}
-define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_seq_cst_i32_global(
+define i64 @release_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_shared_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") release seq_cst
+ ret i64 %new
}
-define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acquire_seq_cst_i32_shared(
+define i64 @release_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_shared_gpu(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") release seq_cst
+ ret i64 %new
}
-define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_monotonic_i32_generic(
+define i64 @acq_rel_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_generic_sys(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
-; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+ ret i64 %new
}
-define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_monotonic_i32_global(
+define i64 @acq_rel_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_generic_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
-; SM90-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+ ret i64 %new
}
-define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_monotonic_i32_shared(
+define i64 @acq_rel_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_generic_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
-; SM90-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic
+ ret i64 %new
}
-define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_acquire_i32_generic(
+define i64 @acq_rel_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_generic_gpu(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
-; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+ ret i64 %new
}
-define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_acquire_i32_global(
+define i64 @acq_rel_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_global_sys(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2];
-; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+ ret i64 %new
}
-define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_acquire_i32_shared(
+define i64 @acq_rel_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_global_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
-; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+ ret i64 %new
}
-define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_seq_cst_i32_generic(
+define i64 @acq_rel_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_global_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic
+ ret i64 %new
}
-define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_seq_cst_i32_global(
+define i64 @acq_rel_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_global_gpu(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+ ret i64 %new
}
-define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: release_seq_cst_i32_shared(
+define i64 @acq_rel_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_shared_sys(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel monotonic
+ ret i64 %new
}
-define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_monotonic_i32_generic(
+define i64 @acq_rel_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_shared_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
-; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel monotonic
+ ret i64 %new
}
-define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_monotonic_i32_global(
+define i64 @acq_rel_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_shared_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
-; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel monotonic
+ ret i64 %new
}
-define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_monotonic_i32_shared(
+define i64 @acq_rel_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_shared_gpu(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
-; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel monotonic
+ ret i64 %new
}
-define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_acquire_i32_generic(
+define i64 @acq_rel_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_generic_sys(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
-; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
+ ret i64 %new
}
-define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_acquire_i32_global(
+define i64 @acq_rel_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_generic_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
-; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
+ ret i64 %new
}
-define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_acquire_i32_shared(
+define i64 @acq_rel_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_generic_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
-; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
-; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire
+ ret i64 %new
}
-define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i32_generic(
+define i64 @acq_rel_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_generic_gpu(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
+ ret i64 %new
}
-define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i32_global(
+define i64 @acq_rel_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_global_sys(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
+ ret i64 %new
}
-define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i32_shared(
+define i64 @acq_rel_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_global_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
+ ret i64 %new
}
-define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_monotonic_i32_generic(
+define i64 @acq_rel_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_global_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire
+ ret i64 %new
}
-define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_monotonic_i32_global(
+define i64 @acq_rel_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_global_gpu(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
+ ret i64 %new
}
-define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_monotonic_i32_shared(
+define i64 @acq_rel_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_shared_sys(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_sys_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acq_rel.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel acquire
+ ret i64 %new
}
-define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_acquire_i32_generic(
+define i64 @acq_rel_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_shared_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cta_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acq_rel.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel acquire
+ ret i64 %new
}
-define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_acquire_i32_global(
+define i64 @acq_rel_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_shared_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_cluster_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acq_rel.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel acquire
+ ret i64 %new
}
-define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_acquire_i32_shared(
+define i64 @acq_rel_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_shared_gpu(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_gpu_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acq_rel.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
- ret i32 %new
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel acquire
+ ret i64 %new
}
-define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i32_generic(
+define i64 @acq_rel_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_generic_sys(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_sys_param_0];
; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
+ ret i64 %new
}
-define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i32_global(
+define i64 @acq_rel_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_generic_cta(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
+ ret i64 %new
}
-define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i32_shared(
+define i64 @acq_rel_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_generic_cluster(
; SM90: {
-; SM90-NEXT: .reg .b32 %r<4>;
-; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
-; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
-; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst
+ ret i64 %new
}
-define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_monotonic_i64_generic(
+define i64 @acq_rel_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
-; SM90-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
ret i64 %new
}
-define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_monotonic_i64_global(
+define i64 @acq_rel_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_global_sys(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
-; SM90-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
ret i64 %new
}
-define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_monotonic_i64_shared(
+define i64 @acq_rel_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_global_cta(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
-; SM90-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
ret i64 %new
}
-define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_acquire_i64_generic(
+define i64 @acq_rel_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_global_cluster(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst
ret i64 %new
}
-define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_acquire_i64_global(
+define i64 @acq_rel_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_global_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
ret i64 %new
}
-define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_acquire_i64_shared(
+define i64 @acq_rel_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_shared_sys(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") acq_rel seq_cst
ret i64 %new
}
-define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_seq_cst_i64_generic(
+define i64 @acq_rel_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_shared_cta(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") acq_rel seq_cst
ret i64 %new
}
-define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_seq_cst_i64_global(
+define i64 @acq_rel_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_shared_cluster(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") acq_rel seq_cst
ret i64 %new
}
-define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: monotonic_seq_cst_i64_shared(
+define i64 @acq_rel_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_shared_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") acq_rel seq_cst
ret i64 %new
}
-define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_monotonic_i64_generic(
+define i64 @seq_cst_monotonic_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_generic_sys(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
ret i64 %new
}
-define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_monotonic_i64_global(
+define i64 @seq_cst_monotonic_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_generic_cta(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
ret i64 %new
}
-define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_monotonic_i64_shared(
+define i64 @seq_cst_monotonic_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_generic_cluster(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic
ret i64 %new
}
-define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_acquire_i64_generic(
+define i64 @seq_cst_monotonic_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
ret i64 %new
}
-define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_acquire_i64_global(
+define i64 @seq_cst_monotonic_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_global_sys(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
ret i64 %new
}
-define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_acquire_i64_shared(
+define i64 @seq_cst_monotonic_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_global_cta(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
ret i64 %new
}
-define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_seq_cst_i64_generic(
+define i64 @seq_cst_monotonic_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_global_cluster(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic
ret i64 %new
}
-define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_seq_cst_i64_global(
+define i64 @seq_cst_monotonic_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_global_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
ret i64 %new
}
-define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acquire_seq_cst_i64_shared(
+define i64 @seq_cst_monotonic_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_shared_sys(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_sys_param_0];
; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst monotonic
ret i64 %new
}
-define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_monotonic_i64_generic(
+define i64 @seq_cst_monotonic_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_shared_cta(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
-; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst monotonic
ret i64 %new
}
-define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_monotonic_i64_global(
+define i64 @seq_cst_monotonic_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_shared_cluster(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
-; SM90-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst monotonic
ret i64 %new
}
-define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_monotonic_i64_shared(
+define i64 @seq_cst_monotonic_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_shared_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
-; SM90-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst monotonic
ret i64 %new
}
-define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_acquire_i64_generic(
+define i64 @seq_cst_acquire_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_generic_sys(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
-; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
ret i64 %new
}
-define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_acquire_i64_global(
+define i64 @seq_cst_acquire_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_generic_cta(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
-; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
ret i64 %new
}
-define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_acquire_i64_shared(
+define i64 @seq_cst_acquire_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_generic_cluster(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
-; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire
ret i64 %new
}
-define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_seq_cst_i64_generic(
+define i64 @seq_cst_acquire_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
ret i64 %new
}
-define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_seq_cst_i64_global(
+define i64 @seq_cst_acquire_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_global_sys(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_sys_param_0];
; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
ret i64 %new
}
-define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: release_seq_cst_i64_shared(
+define i64 @seq_cst_acquire_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_global_cta(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_monotonic_i64_generic(
+define i64 @seq_cst_acquire_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_global_cluster(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
-; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_monotonic_i64_global(
+define i64 @seq_cst_acquire_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_global_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
-; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_monotonic_i64_shared(
+define i64 @seq_cst_acquire_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_shared_sys(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
-; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_sys_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_acquire_i64_generic(
+define i64 @seq_cst_acquire_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_shared_cta(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
-; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_acquire_i64_global(
+define i64 @seq_cst_acquire_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_shared_cluster(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
-; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_acquire_i64_shared(
+define i64 @seq_cst_acquire_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_shared_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
-; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst acquire
ret i64 %new
}
-define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i64_generic(
+define i64 @seq_cst_seq_cst_i64_generic_sys(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_generic_sys(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_sys_param_0];
; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
ret i64 %new
}
-define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i64_global(
+define i64 @seq_cst_seq_cst_i64_generic_cta(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_generic_cta(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
ret i64 %new
}
-define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: acq_rel_seq_cst_i64_shared(
+define i64 @seq_cst_seq_cst_i64_generic_cluster(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_generic_cluster(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_monotonic_i64_generic(
+define i64 @seq_cst_seq_cst_i64_generic_gpu(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_generic_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_monotonic_i64_global(
+define i64 @seq_cst_seq_cst_i64_global_sys(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_global_sys(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_sys_param_0];
; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_monotonic_i64_shared(
+define i64 @seq_cst_seq_cst_i64_global_cta(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_global_cta(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_acquire_i64_generic(
+define i64 @seq_cst_seq_cst_i64_global_cluster(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_global_cluster(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_acquire_i64_global(
+define i64 @seq_cst_seq_cst_i64_global_gpu(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_global_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_acquire_i64_shared(
+define i64 @seq_cst_seq_cst_i64_shared_sys(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_shared_sys(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_sys_param_0];
; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_sys_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_sys_param_2];
+; SM90-NEXT: atom.acquire.sys.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i64_generic(
+define i64 @seq_cst_seq_cst_i64_shared_cta(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_shared_cta(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
-; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cta_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cta_param_2];
+; SM90-NEXT: atom.acquire.cta.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("block") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i64_global(
+define i64 @seq_cst_seq_cst_i64_shared_cluster(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_shared_cluster(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
-; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_cluster_param_0];
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_cluster_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_cluster_param_2];
+; SM90-NEXT: atom.acquire.cluster.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("cluster") seq_cst seq_cst
ret i64 %new
}
-define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
-; SM90-LABEL: seq_cst_seq_cst_i64_shared(
+define i64 @seq_cst_seq_cst_i64_shared_gpu(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_shared_gpu(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<5>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
-; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
-; SM90-NEXT: fence.sc.sys;
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
-; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
-; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_gpu_param_0];
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_gpu_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_gpu_param_2];
+; SM90-NEXT: atom.acquire.gpu.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
; SM90-NEXT: ret;
- %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new syncscope("device") seq_cst seq_cst
ret i64 %new
}
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
index aaea0d2ee25ef..7767cec9c4fcb 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
@@ -79,7 +79,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB0_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -206,7 +206,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB1_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -336,7 +336,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB2_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -466,7 +466,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB3_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -598,7 +598,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r17, %r20, %r3;
; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r18, %r17;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB4_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -726,7 +726,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r16, %r19, %r3;
; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
; SM70-NEXT: @%p1 bra $L__BB5_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -850,7 +850,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r16, %r19, %r3;
; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
; SM70-NEXT: @%p1 bra $L__BB6_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -977,7 +977,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r16, %r19, %r3;
; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
; SM70-NEXT: @%p1 bra $L__BB7_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1104,7 +1104,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r16, %r19, %r3;
; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
; SM70-NEXT: @%p1 bra $L__BB8_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1234,7 +1234,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r16, %r19, %r3;
; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r7, [%rd1], %r17, %r16;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
; SM70-NEXT: @%p1 bra $L__BB9_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
@@ -1316,7 +1316,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
; SM70-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1];
; SM70-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2];
-; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: atom.relaxed.sys.cas.b32 %r3, [%rd1], %r1, %r2;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
; SM90-LABEL: relaxed_sys_i32(
@@ -1358,7 +1358,7 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
; SM70-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
; SM70-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: atom.acq_rel.sys.cas.b32 %r3, [%rd1], %r1, %r2;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
; SM90-LABEL: acq_rel_sys_i32(
@@ -1400,7 +1400,7 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0];
; SM70-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1];
; SM70-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
; SM90-LABEL: acquire_sys_i32(
@@ -1442,7 +1442,7 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0];
; SM70-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1];
; SM70-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2];
-; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: atom.release.sys.cas.b32 %r3, [%rd1], %r1, %r2;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
; SM90-LABEL: release_sys_i32(
@@ -1486,7 +1486,7 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
; SM70-NEXT: fence.sc.sys;
; SM70-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
; SM70-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
-; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: atom.acquire.sys.cas.b32 %r3, [%rd1], %r1, %r2;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
; SM90-LABEL: seq_cst_sys_i32(
@@ -1529,7 +1529,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
; SM70-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
-; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: atom.relaxed.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
; SM90-LABEL: relaxed_sys_i64(
@@ -1568,7 +1568,7 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0];
; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1];
; SM70-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
; SM90-LABEL: acquire_sys_i64(
@@ -1607,7 +1607,7 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
-; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: atom.acq_rel.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
; SM90-LABEL: acq_rel_sys_i64(
@@ -1646,7 +1646,7 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0];
; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1];
; SM70-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2];
-; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: atom.release.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
; SM90-LABEL: release_sys_i64(
@@ -1687,7 +1687,7 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
; SM70-NEXT: fence.sc.sys;
; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
-; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: atom.acquire.sys.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
; SM90-LABEL: seq_cst_sys_i64(
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py
index ae7450015ecd2..277704bd9d5a5 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.py
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py
@@ -5,8 +5,8 @@
from itertools import product
cmpxchg_func = Template(
- """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
- %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure
+ """define i$size @${success}_${failure}_i${size}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
+ %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new syncscope(\"${llvm_scope}\") $success $failure
ret i$size %new
}
"""
@@ -38,9 +38,12 @@
for sm, ptx in TESTS:
with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp:
print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
- for size, success, failure, addrspace in product(
- SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES
+ for size, success, failure, addrspace, llvm_scope in product(
+ SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES, LLVM_SCOPES
):
+        # cluster scope is supported from SM90 onwards
+ if sm != 90 and llvm_scope == "cluster":
+ continue
if addrspace == 0:
addrspace_cast = ""
else:
@@ -52,6 +55,8 @@
size=size,
addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
addrspace_cast=addrspace_cast,
+ llvm_scope=llvm_scope,
+ ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
),
file=fp,
)
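
For anyone reproducing one of the generated tests by hand, below is a minimal, self-contained sketch of how the extended template expands a single (success, failure, size, addrspace, scope) combination. The SCOPE_LLVM_TO_PTX contents shown here are only inferred from the generated checks above (syncscope("") -> sys, "block" -> cta, "cluster" -> cluster, "device" -> gpu); the real definitions of LLVM_SCOPES and SCOPE_LLVM_TO_PTX live elsewhere in cmpxchg.py and are not part of this hunk.

# Standalone sketch, not part of the patch.
from string import Template

cmpxchg_func = Template(
    """define i$size @${success}_${failure}_i${size}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
  %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new syncscope("${llvm_scope}") $success $failure
  ret i$size %new
}
"""
)

# LLVM syncscope name -> PTX scope token (inferred from the CHECK lines above).
SCOPE_LLVM_TO_PTX = {"": "sys", "block": "cta", "cluster": "cluster", "device": "gpu"}

# Expands to the IR of @seq_cst_monotonic_i64_global_cta from cmpxchg-sm90.ll.
print(
    cmpxchg_func.substitute(
        success="seq_cst",
        failure="monotonic",
        size=64,
        addrspace="global",
        addrspace_cast=" addrspace(1)",
        llvm_scope="block",
        ptx_scope=SCOPE_LLVM_TO_PTX["block"],
    )
)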