[llvm] [NVPTX] Support for memory orderings for cmpxchg (PR #126159)
Akshay Deodhar via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 7 16:56:41 PST 2025
https://github.com/akshayrdeodhar updated https://github.com/llvm/llvm-project/pull/126159
>From 93bff73972f629367f41a8b24216410092233c8f Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Wed, 27 Nov 2024 03:43:49 +0000
Subject: [PATCH 1/5] [NVPTX] Support for memory orderings for cmpxchg
---
llvm/include/llvm/CodeGen/TargetLowering.h | 8 +
llvm/lib/CodeGen/AtomicExpandPass.cpp | 6 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 65 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 11 +
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 139 +-
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 2 +-
llvm/test/CodeGen/NVPTX/atomics.ll | 6 +-
llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 5680 +++++++++++++++++++
llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 5680 +++++++++++++++++++
llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 5680 +++++++++++++++++++
llvm/test/CodeGen/NVPTX/cmpxchg.ll | 1629 +++++-
llvm/test/CodeGen/NVPTX/cmpxchg.py | 46 +
llvm/test/CodeGen/NVPTX/lit.local.cfg | 2 +-
13 files changed, 18815 insertions(+), 139 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
create mode 100644 llvm/test/CodeGen/NVPTX/cmpxchg.py
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bbecc7a6ddaee79..ac3233ec30ee3ea 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2173,6 +2173,14 @@ class TargetLoweringBase {
return false;
}
+ /// The memory ordering that AtomicExpandPass should assign to an atomic
+ /// instruction that it has lowered by adding fences. A target can return
+ /// an ordering stronger than the default to "fold" one of the fences into
+ /// the atomic instruction itself. Defaults to Monotonic.
+ virtual AtomicOrdering
+ atomicOperationOrderAfterFenceSplit(const Instruction *I) const {
+ return AtomicOrdering::Monotonic;
+ }
+
/// Whether AtomicExpandPass should automatically insert a trailing fence
/// without reducing the ordering for this atomic. Defaults to false.
virtual bool
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index a75fa688d87a8dd..a3e9700fa3089c8 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -324,8 +324,10 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
// failure path. As a result, fence insertion is directly done by
// expandAtomicCmpXchg in that case.
FenceOrdering = CASI->getMergedOrdering();
- CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
- CASI->setFailureOrdering(AtomicOrdering::Monotonic);
+ auto CASOrdering = TLI->atomicOperationOrderAfterFenceSplit(CASI);
+
+ CASI->setSuccessOrdering(CASOrdering);
+ CASI->setFailureOrdering(CASOrdering);
}
if (FenceOrdering != AtomicOrdering::Monotonic) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 58ad92a8934a66d..a74eada0afcebab 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
@@ -49,6 +50,7 @@
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"
+#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
@@ -995,6 +997,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// actions
computeRegisterProperties(STI.getRegisterInfo());
+ // PTX support for 16-bit CAS is emulated. Only use 32+
setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
setMaxAtomicSizeInBitsSupported(64);
setMaxDivRemBitWidthSupported(64);
@@ -5565,6 +5568,68 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
return AtomicExpansionKind::CmpXChg;
}
+bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
+    const Instruction *I) const {
+  // Explicit fences are requested only for cmpxchg, in two situations:
+  //  1. The CAS is narrower than what the hardware supports. It is then
+  //     emulated with a retry loop around a wider monotonic CAS, and the
+  //     memory order is enforced by fences placed around that loop.
+  //  2. The CAS is seq_cst. Natively supported CAS operations are normally
+  //     ordered by the atom.cas instruction itself, but atom.cas only offers
+  //     relaxed, acquire, release and acq_rel, so seq_cst needs fences too.
+  const auto *CAS = dyn_cast<AtomicCmpXchgInst>(I);
+  if (!CAS)
+    return false;
+
+  const unsigned CASWidth =
+      cast<IntegerType>(CAS->getCompareOperand()->getType())->getBitWidth();
+  if (CASWidth < STI.getMinCmpXchgSizeInBits())
+    return true;
+
+  return CAS->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent;
+}
+
+AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
+    const Instruction *I) const {
+  // Anything that is not a seq_cst cmpxchg is left monotonic once the fences
+  // have been split off.
+  const auto *CAS = dyn_cast<AtomicCmpXchgInst>(I);
+  if (!CAS ||
+      CAS->getMergedOrdering() != AtomicOrdering::SequentiallyConsistent)
+    return AtomicOrdering::Monotonic;
+
+  // A seq_cst cmpxchg narrower than the minimum supported width is also left
+  // monotonic; only a seq_cst cmpxchg of supported width keeps acquire
+  // ordering on the operation itself.
+  const unsigned CASWidth =
+      cast<IntegerType>(CAS->getCompareOperand()->getType())->getBitWidth();
+  if (CASWidth < STI.getMinCmpXchgSizeInBits())
+    return AtomicOrdering::Monotonic;
+
+  return AtomicOrdering::Acquire;
+}
+
+Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
+                                                   Instruction *Inst,
+                                                   AtomicOrdering Ord) const {
+  // Only cmpxchg is specialized; everything else uses the default lowering.
+  if (!isa<AtomicCmpXchgInst>(Inst))
+    return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
+
+  // Orderings weaker than release need no leading fence.
+  if (!isReleaseOrStronger(Ord))
+    return nullptr;
+
+  // A seq_cst cmpxchg gets a seq_cst leading fence; release and acq_rel get
+  // a release fence.
+  const AtomicOrdering FenceOrd =
+      Ord == AtomicOrdering::SequentiallyConsistent
+          ? AtomicOrdering::SequentiallyConsistent
+          : AtomicOrdering::Release;
+  return Builder.CreateFence(FenceOrd);
+}
+
+Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
+                                                    Instruction *Inst,
+                                                    AtomicOrdering Ord) const {
+  // Only cmpxchg is specialized; everything else uses the default lowering.
+  // A single checked dyn_cast replaces the former isa<> followed by an
+  // unchecked dyn_cast<>()-> dereference.
+  auto *CAS = dyn_cast<AtomicCmpXchgInst>(Inst);
+  if (!CAS)
+    return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
+
+  const unsigned CASWidth =
+      cast<IntegerType>(CAS->getCompareOperand()->getType())->getBitWidth();
+
+  // Orderings weaker than acquire need no trailing fence.
+  if (!isAcquireOrStronger(Ord))
+    return nullptr;
+
+  // A seq_cst cmpxchg of supported width is not emulated and gets no trailing
+  // fence: atomicOperationOrderAfterFenceSplit leaves acquire ordering on the
+  // operation itself in that case.
+  const bool IsNativeSeqCst =
+      Ord == AtomicOrdering::SequentiallyConsistent &&
+      CASWidth >= STI.getMinCmpXchgSizeInBits();
+  if (IsNativeSeqCst)
+    return nullptr;
+
+  return Builder.CreateFence(AtomicOrdering::Acquire);
+}
+
// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 5adf69d621552f3..1ee7a9b9ab8e36b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -17,6 +17,7 @@
#include "NVPTX.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Support/AtomicOrdering.h"
namespace llvm {
namespace NVPTXISD {
@@ -260,6 +261,16 @@ class NVPTXTargetLowering : public TargetLowering {
return true;
}
+ bool shouldInsertFencesForAtomic(const Instruction *) const override;
+
+ AtomicOrdering
+ atomicOperationOrderAfterFenceSplit(const Instruction *I) const override;
+
+ Instruction *emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+ Instruction *emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst,
+ AtomicOrdering Ord) const override;
+
private:
const NVPTXSubtarget &STI; // cache the subtarget here
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 06c629c01d9ab09..14744411bc74bcd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1962,41 +1962,41 @@ multiclass F_ATOMIC_2_NEG<ValueType regT, NVPTXRegClass regclass, string SpaceSt
// has 3 operands
multiclass F_ATOMIC_3_imp<ValueType ptrT, NVPTXRegClass ptrclass,
- ValueType regT, NVPTXRegClass regclass,
+ ValueType regT, NVPTXRegClass regclass, string SemStr,
string SpaceStr, string TypeStr, string OpcStr, PatFrag IntOp,
Operand IMMType, list<Predicate> Pred> {
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
def reg : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, regclass:$c),
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), (regT regclass:$c)))]>,
Requires<Pred>;
def imm1 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, regclass:$c),
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, (regT regclass:$c)))]>,
Requires<Pred>;
def imm2 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, regclass:$b, IMMType:$c),
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;", ""),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), (regT regclass:$b), imm:$c))]>,
Requires<Pred>;
def imm3 : NVPTXInst<(outs regclass:$dst),
(ins ptrclass:$addr, IMMType:$b, IMMType:$c),
- !strconcat("atom", SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
+ !strconcat("atom", SemStr, SpaceStr, OpcStr, TypeStr, " \t$dst, [$addr], $b, $c;"),
[(set (regT regclass:$dst), (IntOp (ptrT ptrclass:$addr), imm:$b, imm:$c))]>,
Requires<Pred>;
}
}
-multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SpaceStr, string TypeStr,
- string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
- defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
- IntOp, IMMType, Pred>;
- defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SpaceStr, TypeStr, OpcStr,
- IntOp, IMMType, Pred>;
+multiclass F_ATOMIC_3<ValueType regT, NVPTXRegClass regclass, string SemStr, string SpaceStr,
+ string TypeStr, string OpcStr, PatFrag IntOp, Operand IMMType, list<Predicate> Pred = []> {
+ defm p32 : F_ATOMIC_3_imp<i32, Int32Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
+ OpcStr, IntOp, IMMType, Pred>;
+ defm p64 : F_ATOMIC_3_imp<i64, Int64Regs, regT, regclass, SemStr, SpaceStr, TypeStr,
+ OpcStr, IntOp, IMMType, Pred>;
}
// atom_add
@@ -2378,51 +2378,86 @@ defm INT_PTX_ATOM_XOR_GEN_64 : F_ATOMIC_2<i64, Int64Regs, "", ".b64", ".xor",
defm INT_PTX_ATOM_XOR_GEN_64_USE_G : F_ATOMIC_2<i64, Int64Regs, ".global", ".b64",
".xor", atomic_load_xor_i64_gen, i64imm, imm, [hasSM<32>]>;
-// atom_cas
-
-def atomic_cmp_swap_i16_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i16_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i16_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i16 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i32_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i32 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_g: ATOMIC_GLOBAL_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_s: ATOMIC_SHARED_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-def atomic_cmp_swap_i64_gen: ATOMIC_GENERIC_CHK<(ops node:$a, node:$b, node:$c),
- (atomic_cmp_swap_i64 node:$a, node:$b, node:$c)>;
-
-defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
+// For the three-operand (ptr, cmp, val) operator named NAME, defines one
+// pattern fragment per address space: NAME_gen, NAME_g and NAME_s, using
+// AS_match.generic, AS_match.global and AS_match.shared respectively.
+multiclass ternary_atomic_op_as {
+ // one record per address space
+ def NAME#_gen: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+ AS_match.generic>;
+
+ def NAME#_g: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+ AS_match.global>;
+
+ def NAME#_s: PatFrag<(ops node:$ptr, node:$cmp, node:$val),
+ (!cast<SDPatternOperator>(NAME) node:$ptr, node:$cmp, node:$val),
+ AS_match.shared>;
+}
+
+// Generate the per-address-space pattern fragments for every combination of
+// size x memory order, e.g. atomic_cmp_swap_i32_acquire_gen. The empty order
+// string yields the fragments for the operator without an ordering suffix,
+// e.g. atomic_cmp_swap_i32_gen.
+// NOTE: i8 cmpxchg is not supported in PTX, and AtomicExpandPass will emulate
+// all i8 cmpxchgs using a larger-bitwidth CAS.
+foreach size = ["i16", "i32", "i64"] in {
+ foreach order = ["", "_monotonic", "_acquire", "_release", "_acq_rel", "_seq_cst"] in {
+ defm atomic_cmp_swap#_#size#order: ternary_atomic_op_as;
+ }
+}
+
+// Defines the atom.cas instructions for one (type, order) pair.
+//   atomic_cmp_swap_without_as: name of the cmpxchg operator without its
+//                               address-space suffix (_gen/_g/_s are appended).
+//   type:  bit width as a string, e.g. "32".
+//   order: memory-order qualifier inserted after "atom", e.g. ".acquire".
+multiclass INT_PTX_ATOM_CAS_AS<string atomic_cmp_swap_without_as, string type, string order> {
+ // e.g. with type = 32, order = .acquire and
+ // atomic_cmp_swap_without_as = atomic_cmp_swap_i32_acquire, _GEN becomes
+ // F_ATOMIC_3<i32, Int32Regs, ".acquire", "", ".b32", ".cas",
+ // atomic_cmp_swap_i32_acquire_gen, i32imm, [hasSM<70>, hasPTX<63>]>
+
+
+ // For SM70, PTX63 +, memory orders are supported
+ defm _GEN: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, "", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+ defm _GEN_USE_G: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+ defm _G: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_g), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+ defm _S: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".shared", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_s), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+
+ // For older archs, we fall back to atom.cas without a memory-order
+ // qualifier (empty SemStr) and without any SM/PTX predicate.
+ defm _GEN_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", "", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm")>;
+
+ defm _GEN_USE_G_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm")>;
+
+ defm _G_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_g), !cast<Operand>("i"#type#"imm")>;
+
+ defm _S_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".shared", ".b"#type, ".cas",
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_s), !cast<Operand>("i"#type#"imm")>;
+}
+
+// Define atom.cas for all combinations of size x memory order supported in PTX *and* on the hardware.
+foreach size = ["32", "64"] in {
+ // We enumerate the instructions with specific memory orders ["acquire", "release", "acq_rel"]
+ // *before* the instructions with the unspecified memory order [""].
+ // This ensures that `cmpxchg ... acquire` is lowered to "atom.acquire.cas" rather than to the
+ // unspecified-order form ("atom.relaxed.cas", or plain "atom.cas" for the _OLD variants).
+ // Note that there is no "seq_cst" entry: for these sizes NVPTXTargetLowering asks AtomicExpand
+ // to rewrite cmpxchg seq_cst as a cmpxchg acquire preceded by a seq_cst fence
+ // (see atomicOperationOrderAfterFenceSplit and emitLeadingFence).
+ foreach order = ["acquire", "release", "acq_rel", ""] in {
+ if !eq(order, "") then {
+ defm INT_PTX_ATOM_CAS_#size : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, ".relaxed">;
+ } else {
+ defm INT_PTX_ATOM_CAS_#size#_#order : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "."#order>;
+ }
+ }
+}
+
+// Note that 16-bit CAS support in PTX is *emulated*. (TODO: public?)
+defm INT_PTX_ATOM_CAS_G_16 : F_ATOMIC_3<i16, Int16Regs, "", ".global", ".b16", ".cas",
atomic_cmp_swap_i16_g, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, ".shared", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_S_16 : F_ATOMIC_3<i16, Int16Regs, "", ".shared", ".b16", ".cas",
atomic_cmp_swap_i16_s, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_GEN_16 : F_ATOMIC_3<i16, Int16Regs, "", "", ".b16", ".cas",
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, ".global", ".b16", ".cas",
+defm INT_PTX_ATOM_CAS_GEN_16_USE_G : F_ATOMIC_3<i16, Int16Regs, "", ".global", ".b16", ".cas",
atomic_cmp_swap_i16_gen, i16imm, [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_CAS_G_32 : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32", ".cas",
- atomic_cmp_swap_i32_g, i32imm>;
-defm INT_PTX_ATOM_CAS_S_32 : F_ATOMIC_3<i32, Int32Regs, ".shared", ".b32", ".cas",
- atomic_cmp_swap_i32_s, i32imm>;
-defm INT_PTX_ATOM_CAS_GEN_32 : F_ATOMIC_3<i32, Int32Regs, "", ".b32", ".cas",
- atomic_cmp_swap_i32_gen, i32imm>;
-defm INT_PTX_ATOM_CAS_GEN_32_USE_G : F_ATOMIC_3<i32, Int32Regs, ".global", ".b32",
- ".cas", atomic_cmp_swap_i32_gen, i32imm>;
-defm INT_PTX_ATOM_CAS_G_64 : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64", ".cas",
- atomic_cmp_swap_i64_g, i64imm>;
-defm INT_PTX_ATOM_CAS_S_64 : F_ATOMIC_3<i64, Int64Regs, ".shared", ".b64", ".cas",
- atomic_cmp_swap_i64_s, i64imm>;
-defm INT_PTX_ATOM_CAS_GEN_64 : F_ATOMIC_3<i64, Int64Regs, "", ".b64", ".cas",
- atomic_cmp_swap_i64_gen, i64imm>;
-defm INT_PTX_ATOM_CAS_GEN_64_USE_G : F_ATOMIC_3<i64, Int64Regs, ".global", ".b64",
- ".cas", atomic_cmp_swap_i64_gen, i64imm>;
// Support for scoped atomic operations. Matches
// int_nvvm_atomic_{op}_{space}_{type}_{scope}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 851c9152e4cb8ff..f893f1aefec8485 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -147,7 +147,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// set of equivalent memory operations with a scalar data-type, executed in
// an unspecified order on the elements in the vector.
unsigned getMaxRequiredAlignment() const { return 8; }
- // Emulated loops with 32-bit/64-bit CAS generate better SASS than 16-bit CAS
+ // Get the smallest cmpxchg word size that the hardware supports.
unsigned getMinCmpXchgSizeInBits() const { return 32; }
unsigned getPTXVersion() const { return PTXVersion; }
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index fccc3a8844ffc4f..ac2bd8828b20579 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -177,7 +177,7 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
; CHECK-LABEL: atomicrmw_add_f16_generic
define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
-; CHECK: atom.cas
+; CHECK: atom.acquire.cas
%ret = atomicrmw fadd ptr %addr, half %val seq_cst
ret half %ret
}
@@ -198,14 +198,14 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
; CHECK-LABEL: atomic_cmpxchg_i32
define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) {
-; CHECK: atom.cas.b32
+; CHECK: atom.acquire.cas.b32
%pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
ret i32 %new
}
; CHECK-LABEL: atomic_cmpxchg_i64
define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) {
-; CHECK: atom.cas.b64
+; CHECK: atom.acquire.cas.b64
%pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
ret i64 %new
}
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
new file mode 100644
index 000000000000000..ea308c2a7673b7b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll
@@ -0,0 +1,5680 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %}
+
+define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB0_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB0_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB0_1;
+; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+ ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB1_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB1_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB1_1;
+; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+ ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_monotonic_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB2_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB2_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB2_1;
+; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+ ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB3_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB3_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB3_1;
+; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+ ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB4_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB4_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB4_1;
+; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+ ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_acquire_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB5_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB5_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB5_1;
+; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+ ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB6_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB6_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB6_1;
+; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+ ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB7_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB7_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB7_1;
+; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: monotonic_seq_cst_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB8_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB8_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB8_1;
+; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB9_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB9_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB9_1;
+; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+ ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB10_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB10_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB10_1;
+; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+ ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_monotonic_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB11_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB11_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB11_1;
+; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+ ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB12_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB12_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB12_1;
+; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+ ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB13_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB13_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB13_1;
+; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+ ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_acquire_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB14_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB14_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB14_1;
+; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+ ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB15_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB15_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB15_1;
+; SM60-NEXT: $L__BB15_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+ ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB16_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB16_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB16_1;
+; SM60-NEXT: $L__BB16_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+ ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acquire_seq_cst_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB17_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB17_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB17_1;
+; SM60-NEXT: $L__BB17_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+ ret i8 %new
+}
+
+define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB18_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB18_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB18_1;
+; SM60-NEXT: $L__BB18_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+ ret i8 %new
+}
+
+define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB19_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB19_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB19_1;
+; SM60-NEXT: $L__BB19_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+ ret i8 %new
+}
+
+define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_monotonic_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB20_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB20_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB20_1;
+; SM60-NEXT: $L__BB20_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB21_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB21_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB21_1;
+; SM60-NEXT: $L__BB21_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB22_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB22_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB22_1;
+; SM60-NEXT: $L__BB22_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_acquire_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB23_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB23_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB23_1;
+; SM60-NEXT: $L__BB23_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB24_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB24_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB24_1;
+; SM60-NEXT: $L__BB24_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB25_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB25_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB25_1;
+; SM60-NEXT: $L__BB25_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: release_seq_cst_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB26_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB26_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB26_1;
+; SM60-NEXT: $L__BB26_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB27_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB27_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB27_1;
+; SM60-NEXT: $L__BB27_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB28_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB28_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB28_1;
+; SM60-NEXT: $L__BB28_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_monotonic_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB29_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB29_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB29_1;
+; SM60-NEXT: $L__BB29_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB30_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB30_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB30_1;
+; SM60-NEXT: $L__BB30_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB31_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB31_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB31_1;
+; SM60-NEXT: $L__BB31_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_acquire_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB32_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB32_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB32_1;
+; SM60-NEXT: $L__BB32_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB33_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB33_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB33_1;
+; SM60-NEXT: $L__BB33_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB34_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB34_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB34_1;
+; SM60-NEXT: $L__BB34_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB35_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB35_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB35_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB35_1;
+; SM60-NEXT: $L__BB35_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB36_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB36_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB36_1;
+; SM60-NEXT: $L__BB36_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB37_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB37_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB37_1;
+; SM60-NEXT: $L__BB37_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_monotonic_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB38_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB38_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB38_1;
+; SM60-NEXT: $L__BB38_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB39_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB39_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB39_1;
+; SM60-NEXT: $L__BB39_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB40_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB40_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB40_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB40_1;
+; SM60-NEXT: $L__BB40_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_acquire_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB41_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB41_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB41_1;
+; SM60-NEXT: $L__BB41_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB42_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB42_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB42_1;
+; SM60-NEXT: $L__BB42_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.global.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB43_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB43_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB43_1;
+; SM60-NEXT: $L__BB43_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i8_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<21>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM60-NEXT: and.b32 %r10, %r9, 3;
+; SM60-NEXT: shl.b32 %r1, %r10, 3;
+; SM60-NEXT: mov.b32 %r11, 255;
+; SM60-NEXT: shl.b32 %r12, %r11, %r1;
+; SM60-NEXT: not.b32 %r2, %r12;
+; SM60-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM60-NEXT: and.b32 %r14, %r13, 255;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM60-NEXT: shl.b32 %r4, %r15, %r1;
+; SM60-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM60-NEXT: and.b32 %r20, %r16, %r2;
+; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r17, %r20, %r3;
+; SM60-NEXT: or.b32 %r18, %r20, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM60-NEXT: @%p1 bra $L__BB44_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB44_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM60-NEXT: mov.u32 %r20, %r8;
+; SM60-NEXT: @%p2 bra $L__BB44_1;
+; SM60-NEXT: $L__BB44_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ ret i8 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB45_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB45_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB45_1;
+; SM60-NEXT: $L__BB45_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB46_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB46_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB46_1;
+; SM60-NEXT: $L__BB46_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_monotonic_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB47_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB47_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB47_1;
+; SM60-NEXT: $L__BB47_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB48_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB48_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB48_1;
+; SM60-NEXT: $L__BB48_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB49_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB49_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB49_1;
+; SM60-NEXT: $L__BB49_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_acquire_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB50_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB50_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB50_1;
+; SM60-NEXT: $L__BB50_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB51_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB51_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB51_1;
+; SM60-NEXT: $L__BB51_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB52_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB52_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB52_1;
+; SM60-NEXT: $L__BB52_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: monotonic_seq_cst_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB53_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB53_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB53_1;
+; SM60-NEXT: $L__BB53_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB54_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB54_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB54_1;
+; SM60-NEXT: $L__BB54_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB55_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB55_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB55_1;
+; SM60-NEXT: $L__BB55_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_monotonic_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB56_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB56_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB56_1;
+; SM60-NEXT: $L__BB56_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB57_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB57_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB57_1;
+; SM60-NEXT: $L__BB57_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB58_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB58_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB58_1;
+; SM60-NEXT: $L__BB58_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_acquire_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB59_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB59_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB59_1;
+; SM60-NEXT: $L__BB59_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB60_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB60_1;
+; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB61_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB61_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB61_1;
+; SM60-NEXT: $L__BB61_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acquire_seq_cst_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB62_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB62_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB62_1;
+; SM60-NEXT: $L__BB62_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB63_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB63_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB63_1;
+; SM60-NEXT: $L__BB63_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB64_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB64_1;
+; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_monotonic_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB65_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB65_1;
+; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB66_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB66_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB66_1;
+; SM60-NEXT: $L__BB66_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB67_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB67_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB67_1;
+; SM60-NEXT: $L__BB67_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_acquire_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB68_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB68_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB68_1;
+; SM60-NEXT: $L__BB68_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB69_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB69_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB69_1;
+; SM60-NEXT: $L__BB69_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB70_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB70_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB70_1;
+; SM60-NEXT: $L__BB70_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: release_seq_cst_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB71_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB71_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB71_1;
+; SM60-NEXT: $L__BB71_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB72_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB72_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB72_1;
+; SM60-NEXT: $L__BB72_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB73_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB73_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB73_1;
+; SM60-NEXT: $L__BB73_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_monotonic_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB74_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB74_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB74_1;
+; SM60-NEXT: $L__BB74_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB75_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB75_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB75_1;
+; SM60-NEXT: $L__BB75_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB76_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB76_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB76_1;
+; SM60-NEXT: $L__BB76_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_acquire_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB77_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB77_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB77_1;
+; SM60-NEXT: $L__BB77_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB78_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB78_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB78_1;
+; SM60-NEXT: $L__BB78_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB79_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB79_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB79_1;
+; SM60-NEXT: $L__BB79_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB80_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB80_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB80_1;
+; SM60-NEXT: $L__BB80_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB81_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB81_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB81_1;
+; SM60-NEXT: $L__BB81_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB82_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB82_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB82_1;
+; SM60-NEXT: $L__BB82_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_monotonic_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB83_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB83_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB83_1;
+; SM60-NEXT: $L__BB83_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB84_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB84_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB84_1;
+; SM60-NEXT: $L__BB84_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB85_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB85_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB85_1;
+; SM60-NEXT: $L__BB85_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_acquire_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB86_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB86_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB86_1;
+; SM60-NEXT: $L__BB86_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_generic(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB87_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB87_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB87_1;
+; SM60-NEXT: $L__BB87_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_global(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.global.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB88_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB88_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB88_1;
+; SM60-NEXT: $L__BB88_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i16_shared(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM60-NEXT: and.b32 %r11, %r10, 3;
+; SM60-NEXT: shl.b32 %r1, %r11, 3;
+; SM60-NEXT: mov.b32 %r12, 65535;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: not.b32 %r2, %r13;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM60-NEXT: shl.b32 %r3, %r14, %r1;
+; SM60-NEXT: shl.b32 %r4, %r9, %r1;
+; SM60-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM60-NEXT: and.b32 %r19, %r15, %r2;
+; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: or.b32 %r16, %r19, %r3;
+; SM60-NEXT: or.b32 %r17, %r19, %r4;
+; SM60-NEXT: atom.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM60-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM60-NEXT: @%p1 bra $L__BB89_3;
+; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM60-NEXT: // in Loop: Header=BB89_1 Depth=1
+; SM60-NEXT: and.b32 %r8, %r7, %r2;
+; SM60-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM60-NEXT: mov.u32 %r19, %r8;
+; SM60-NEXT: @%p2 bra $L__BB89_1;
+; SM60-NEXT: $L__BB89_3: // %partword.cmpxchg.end
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+ ret i16 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_monotonic_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_acquire_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: monotonic_seq_cst_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_monotonic_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_acquire_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acquire_seq_cst_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_monotonic_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_acquire_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: release_seq_cst_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_monotonic_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_acquire_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_monotonic_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_acquire_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_generic(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
+; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_global(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
+; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i32_shared(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
+; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ ret i32 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_monotonic_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_acquire_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: monotonic_seq_cst_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_monotonic_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_acquire_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acquire_seq_cst_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_monotonic_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_acquire_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: release_seq_cst_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+ ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+ ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_monotonic_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+ ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+ ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_acquire_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+ ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: acq_rel_seq_cst_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+ ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_monotonic_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+ ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+ ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_acquire_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+ ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_generic(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
+; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_global(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
+; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM60-LABEL: seq_cst_seq_cst_i64_shared(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
+; SM60-NEXT: membar.sys;
+; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
+; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
+; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ ret i64 %new
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
new file mode 100644
index 000000000000000..4360ea36e863ab2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll
@@ -0,0 +1,5680 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+
+define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB0_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB0_1;
+; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+ ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB1_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB1_1;
+; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+ ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_monotonic_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB2_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB2_1;
+; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+ ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB3_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB3_1;
+; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+ ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB4_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB4_1;
+; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+ ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_acquire_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB5_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB5_1;
+; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+ ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB6_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB6_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB6_1;
+; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+ ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB7_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB7_1;
+; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: monotonic_seq_cst_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB8_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB8_1;
+; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB9_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB9_1;
+; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+ ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB10_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB10_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB10_1;
+; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+ ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_monotonic_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB11_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB11_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB11_1;
+; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+ ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB12_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB12_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB12_1;
+; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+ ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB13_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB13_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB13_1;
+; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+ ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_acquire_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB14_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB14_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB14_1;
+; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+ ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB15_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB15_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB15_1;
+; SM70-NEXT: $L__BB15_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+ ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB16_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB16_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB16_1;
+; SM70-NEXT: $L__BB16_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+ ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acquire_seq_cst_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB17_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB17_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB17_1;
+; SM70-NEXT: $L__BB17_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+ ret i8 %new
+}
+
+define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB18_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB18_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB18_1;
+; SM70-NEXT: $L__BB18_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+ ret i8 %new
+}
+
+define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB19_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB19_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB19_1;
+; SM70-NEXT: $L__BB19_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+ ret i8 %new
+}
+
+define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_monotonic_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB20_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB20_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB20_1;
+; SM70-NEXT: $L__BB20_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB21_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB21_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB21_1;
+; SM70-NEXT: $L__BB21_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB22_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB22_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB22_1;
+; SM70-NEXT: $L__BB22_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_acquire_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB23_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB23_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB23_1;
+; SM70-NEXT: $L__BB23_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB24_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB24_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB24_1;
+; SM70-NEXT: $L__BB24_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB25_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB25_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB25_1;
+; SM70-NEXT: $L__BB25_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: release_seq_cst_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB26_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB26_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB26_1;
+; SM70-NEXT: $L__BB26_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB27_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB27_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB27_1;
+; SM70-NEXT: $L__BB27_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB28_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB28_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB28_1;
+; SM70-NEXT: $L__BB28_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_monotonic_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB29_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB29_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB29_1;
+; SM70-NEXT: $L__BB29_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB30_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB30_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB30_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB30_1;
+; SM70-NEXT: $L__BB30_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB31_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB31_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB31_1;
+; SM70-NEXT: $L__BB31_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_acquire_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB32_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB32_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB32_1;
+; SM70-NEXT: $L__BB32_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB33_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB33_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB33_1;
+; SM70-NEXT: $L__BB33_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB34_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB34_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB34_1;
+; SM70-NEXT: $L__BB34_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB35_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB35_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB35_1;
+; SM70-NEXT: $L__BB35_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB36_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB36_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB36_1;
+; SM70-NEXT: $L__BB36_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB37_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB37_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB37_1;
+; SM70-NEXT: $L__BB37_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_monotonic_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB38_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB38_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB38_1;
+; SM70-NEXT: $L__BB38_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB39_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB39_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB39_1;
+; SM70-NEXT: $L__BB39_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB40_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB40_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB40_1;
+; SM70-NEXT: $L__BB40_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_acquire_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB41_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB41_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB41_1;
+; SM70-NEXT: $L__BB41_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB42_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB42_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB42_1;
+; SM70-NEXT: $L__BB42_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.global.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB43_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB43_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB43_1;
+; SM70-NEXT: $L__BB43_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i8_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB44_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB44_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB44_1;
+; SM70-NEXT: $L__BB44_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ ret i8 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB45_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB45_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB45_1;
+; SM70-NEXT: $L__BB45_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB46_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB46_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB46_1;
+; SM70-NEXT: $L__BB46_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_monotonic_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB47_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB47_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB47_1;
+; SM70-NEXT: $L__BB47_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB48_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB48_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB48_1;
+; SM70-NEXT: $L__BB48_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB49_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB49_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB49_1;
+; SM70-NEXT: $L__BB49_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_acquire_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB50_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB50_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB50_1;
+; SM70-NEXT: $L__BB50_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB51_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB51_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB51_1;
+; SM70-NEXT: $L__BB51_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB52_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB52_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB52_1;
+; SM70-NEXT: $L__BB52_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: monotonic_seq_cst_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB53_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB53_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB53_1;
+; SM70-NEXT: $L__BB53_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB54_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB54_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB54_1;
+; SM70-NEXT: $L__BB54_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB55_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB55_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB55_1;
+; SM70-NEXT: $L__BB55_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_monotonic_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB56_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB56_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB56_1;
+; SM70-NEXT: $L__BB56_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB57_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB57_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB57_1;
+; SM70-NEXT: $L__BB57_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB58_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB58_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB58_1;
+; SM70-NEXT: $L__BB58_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_acquire_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB59_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB59_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB59_1;
+; SM70-NEXT: $L__BB59_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB60_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB60_1;
+; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB61_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB61_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB61_1;
+; SM70-NEXT: $L__BB61_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acquire_seq_cst_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB62_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB62_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB62_1;
+; SM70-NEXT: $L__BB62_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB63_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB63_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB63_1;
+; SM70-NEXT: $L__BB63_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB64_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB64_1;
+; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_monotonic_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB65_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB65_1;
+; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB66_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB66_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB66_1;
+; SM70-NEXT: $L__BB66_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB67_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB67_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB67_1;
+; SM70-NEXT: $L__BB67_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_acquire_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB68_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB68_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB68_1;
+; SM70-NEXT: $L__BB68_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB69_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB69_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB69_1;
+; SM70-NEXT: $L__BB69_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB70_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB70_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB70_1;
+; SM70-NEXT: $L__BB70_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: release_seq_cst_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB71_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB71_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB71_1;
+; SM70-NEXT: $L__BB71_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB72_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB72_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB72_1;
+; SM70-NEXT: $L__BB72_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB73_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB73_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB73_1;
+; SM70-NEXT: $L__BB73_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_monotonic_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB74_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB74_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB74_1;
+; SM70-NEXT: $L__BB74_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB75_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB75_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB75_1;
+; SM70-NEXT: $L__BB75_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB76_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB76_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB76_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB76_1;
+; SM70-NEXT: $L__BB76_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_acquire_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB77_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB77_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB77_1;
+; SM70-NEXT: $L__BB77_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB78_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB78_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB78_1;
+; SM70-NEXT: $L__BB78_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB79_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB79_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB79_1;
+; SM70-NEXT: $L__BB79_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB80_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB80_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB80_1;
+; SM70-NEXT: $L__BB80_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB81_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB81_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB81_1;
+; SM70-NEXT: $L__BB81_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB82_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB82_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB82_1;
+; SM70-NEXT: $L__BB82_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_monotonic_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB83_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB83_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB83_1;
+; SM70-NEXT: $L__BB83_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB84_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB84_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB84_1;
+; SM70-NEXT: $L__BB84_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB85_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB85_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB85_1;
+; SM70-NEXT: $L__BB85_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_acquire_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB86_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB86_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB86_1;
+; SM70-NEXT: $L__BB86_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_generic(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB87_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB87_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB87_1;
+; SM70-NEXT: $L__BB87_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_global(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.global.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB88_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB88_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB88_1;
+; SM70-NEXT: $L__BB88_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i16_shared(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB89_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB89_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB89_1;
+; SM70-NEXT: $L__BB89_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+ ret i16 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
+; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
+; SM70-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_monotonic_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
+; SM70-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_acquire_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: monotonic_seq_cst_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_monotonic_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_acquire_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acquire_seq_cst_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
+; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
+; SM70-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_monotonic_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
+; SM70-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
+; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2];
+; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_acquire_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
+; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: release_seq_cst_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
+; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
+; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_monotonic_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
+; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
+; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
+; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_acquire_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
+; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_monotonic_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_acquire_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_generic(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_global(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i32_shared(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ ret i32 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
+; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
+; SM70-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_monotonic_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
+; SM70-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_acquire_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: monotonic_seq_cst_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_monotonic_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_acquire_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acquire_seq_cst_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
+; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
+; SM70-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_monotonic_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
+; SM70-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
+; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
+; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_acquire_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
+; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: release_seq_cst_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+ ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
+; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+ ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
+; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_monotonic_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
+; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
+; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+ ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
+; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+ ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_acquire_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
+; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+ ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: acq_rel_seq_cst_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+ ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_monotonic_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+ ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+ ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_acquire_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+ ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_generic(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_global(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
+; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM70-LABEL: seq_cst_seq_cst_i64_shared(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
+; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ ret i64 %new
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
new file mode 100644
index 000000000000000..5acb275a6f5810b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll
@@ -0,0 +1,5680 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
+
+define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB0_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB0_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB0_1;
+; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+ ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB1_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB1_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB1_1;
+; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic monotonic
+ ret i8 %new
+}
+
+define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_monotonic_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB2_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB2_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB2_1;
+; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic monotonic
+ ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB3_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB3_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB3_1;
+; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic acquire
+ ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB4_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB4_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB4_1;
+; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic acquire
+ ret i8 %new
+}
+
+define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_acquire_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB5_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB5_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB5_1;
+; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic acquire
+ ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB6_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB6_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB6_1;
+; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic seq_cst
+ ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB7_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB7_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB7_1;
+; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ ret i8 %new
+}
+
+define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: monotonic_seq_cst_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB8_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB8_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB8_1;
+; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new monotonic seq_cst
+ ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB9_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB9_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB9_1;
+; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire monotonic
+ ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB10_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB10_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB10_1;
+; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire monotonic
+ ret i8 %new
+}
+
+define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_monotonic_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB11_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB11_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB11_1;
+; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire monotonic
+ ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB12_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB12_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB12_1;
+; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+ ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB13_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB13_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB13_1;
+; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire acquire
+ ret i8 %new
+}
+
+define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_acquire_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB14_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB14_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB14_1;
+; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire acquire
+ ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB15_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB15_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB15_1;
+; SM90-NEXT: $L__BB15_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire seq_cst
+ ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB16_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB16_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB16_1;
+; SM90-NEXT: $L__BB16_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acquire seq_cst
+ ret i8 %new
+}
+
+define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acquire_seq_cst_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB17_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB17_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB17_1;
+; SM90-NEXT: $L__BB17_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acquire seq_cst
+ ret i8 %new
+}
+
+define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB18_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB18_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB18_1;
+; SM90-NEXT: $L__BB18_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+ ret i8 %new
+}
+
+define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB19_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB19_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB19_1;
+; SM90-NEXT: $L__BB19_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release monotonic
+ ret i8 %new
+}
+
+define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_monotonic_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB20_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB20_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB20_1;
+; SM90-NEXT: $L__BB20_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release monotonic
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB21_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB21_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB21_1;
+; SM90-NEXT: $L__BB21_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB22_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB22_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB22_1;
+; SM90-NEXT: $L__BB22_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release acquire
+ ret i8 %new
+}
+
+define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_acquire_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB23_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB23_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB23_1;
+; SM90-NEXT: $L__BB23_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release acquire
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB24_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB24_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB24_1;
+; SM90-NEXT: $L__BB24_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB25_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB25_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB25_1;
+; SM90-NEXT: $L__BB25_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new release seq_cst
+ ret i8 %new
+}
+
+define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: release_seq_cst_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB26_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB26_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB26_1;
+; SM90-NEXT: $L__BB26_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new release seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB27_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB27_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB27_1;
+; SM90-NEXT: $L__BB27_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB28_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB28_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB28_1;
+; SM90-NEXT: $L__BB28_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_monotonic_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB29_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB29_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB29_1;
+; SM90-NEXT: $L__BB29_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB30_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB30_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB30_1;
+; SM90-NEXT: $L__BB30_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB31_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB31_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB31_1;
+; SM90-NEXT: $L__BB31_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_acquire_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB32_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB32_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB32_1;
+; SM90-NEXT: $L__BB32_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB33_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB33_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB33_1;
+; SM90-NEXT: $L__BB33_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB34_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB34_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB34_1;
+; SM90-NEXT: $L__BB34_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB35_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB35_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB35_1;
+; SM90-NEXT: $L__BB35_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new acq_rel seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB36_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB36_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB36_1;
+; SM90-NEXT: $L__BB36_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB37_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB37_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB37_1;
+; SM90-NEXT: $L__BB37_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_monotonic_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB38_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB38_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB38_1;
+; SM90-NEXT: $L__BB38_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst monotonic
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB39_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB39_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB39_1;
+; SM90-NEXT: $L__BB39_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB40_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB40_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB40_1;
+; SM90-NEXT: $L__BB40_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_acquire_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB41_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB41_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB41_1;
+; SM90-NEXT: $L__BB41_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB42_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB42_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB42_1;
+; SM90-NEXT: $L__BB42_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.global.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB43_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB43_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB43_1;
+; SM90-NEXT: $L__BB43_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ ret i8 %new
+}
+
+define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i8_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.shared.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB44_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB44_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB44_1;
+; SM90-NEXT: $L__BB44_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ ret i8 %new
+}
+
+define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB45_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB45_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB45_1;
+; SM90-NEXT: $L__BB45_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB46_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB46_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB46_1;
+; SM90-NEXT: $L__BB46_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_monotonic_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB47_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB47_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB47_1;
+; SM90-NEXT: $L__BB47_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB48_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB48_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB48_1;
+; SM90-NEXT: $L__BB48_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB49_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB49_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB49_1;
+; SM90-NEXT: $L__BB49_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_acquire_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB50_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB50_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB50_1;
+; SM90-NEXT: $L__BB50_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic acquire
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB51_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB51_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB51_1;
+; SM90-NEXT: $L__BB51_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB52_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB52_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB52_1;
+; SM90-NEXT: $L__BB52_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: monotonic_seq_cst_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB53_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB53_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB53_1;
+; SM90-NEXT: $L__BB53_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new monotonic seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB54_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB54_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB54_1;
+; SM90-NEXT: $L__BB54_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB55_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB55_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB55_1;
+; SM90-NEXT: $L__BB55_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_monotonic_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB56_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB56_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB56_1;
+; SM90-NEXT: $L__BB56_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB57_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB57_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB57_1;
+; SM90-NEXT: $L__BB57_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB58_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB58_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB58_1;
+; SM90-NEXT: $L__BB58_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_acquire_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB59_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB59_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB59_1;
+; SM90-NEXT: $L__BB59_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire acquire
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB60_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB60_1;
+; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB61_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB61_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB61_1;
+; SM90-NEXT: $L__BB61_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acquire_seq_cst_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB62_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB62_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB62_1;
+; SM90-NEXT: $L__BB62_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acquire seq_cst
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB63_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB63_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB63_1;
+; SM90-NEXT: $L__BB63_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB64_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB64_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB64_1;
+; SM90-NEXT: $L__BB64_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release monotonic
+ ret i16 %new
+}
+
+define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_monotonic_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB65_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB65_1;
+; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release monotonic
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB66_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB66_1;
+; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB67_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB67_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB67_1;
+; SM90-NEXT: $L__BB67_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release acquire
+ ret i16 %new
+}
+
+define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_acquire_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB68_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB68_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB68_1;
+; SM90-NEXT: $L__BB68_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release acquire
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB69_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB69_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB69_1;
+; SM90-NEXT: $L__BB69_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB70_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB70_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB70_1;
+; SM90-NEXT: $L__BB70_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new release seq_cst
+ ret i16 %new
+}
+
+define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: release_seq_cst_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB71_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB71_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB71_1;
+; SM90-NEXT: $L__BB71_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new release seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB72_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB72_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB72_1;
+; SM90-NEXT: $L__BB72_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB73_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB73_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB73_1;
+; SM90-NEXT: $L__BB73_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_monotonic_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB74_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB74_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB74_1;
+; SM90-NEXT: $L__BB74_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB75_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB75_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB75_1;
+; SM90-NEXT: $L__BB75_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB76_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB76_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB76_1;
+; SM90-NEXT: $L__BB76_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_acquire_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB77_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB77_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB77_1;
+; SM90-NEXT: $L__BB77_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel acquire
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB78_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB78_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB78_1;
+; SM90-NEXT: $L__BB78_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB79_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB79_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB79_1;
+; SM90-NEXT: $L__BB79_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB80_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB80_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB80_1;
+; SM90-NEXT: $L__BB80_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new acq_rel seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB81_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB81_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB81_1;
+; SM90-NEXT: $L__BB81_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB82_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB82_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB82_1;
+; SM90-NEXT: $L__BB82_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_monotonic_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB83_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB83_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB83_1;
+; SM90-NEXT: $L__BB83_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst monotonic
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB84_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB84_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB84_1;
+; SM90-NEXT: $L__BB84_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB85_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB85_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB85_1;
+; SM90-NEXT: $L__BB85_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_acquire_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB86_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB86_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB86_1;
+; SM90-NEXT: $L__BB86_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst acquire
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_generic(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB87_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB87_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB87_1;
+; SM90-NEXT: $L__BB87_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_global(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.global.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB88_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB88_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB88_1;
+; SM90-NEXT: $L__BB88_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+ ret i16 %new
+}
+
+define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i16_shared(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.shared.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB89_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB89_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB89_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB89_1;
+; SM90-NEXT: $L__BB89_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i16 %cmp, i16 %new seq_cst seq_cst
+ ret i16 %new
+}
+
+define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2];
+; SM90-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2];
+; SM90-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_monotonic_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2];
+; SM90-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic monotonic
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_acquire_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic acquire
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: monotonic_seq_cst_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new monotonic seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_monotonic_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire monotonic
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_acquire_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire acquire
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acquire_seq_cst_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acquire seq_cst
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2];
+; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2];
+; SM90-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release monotonic
+ ret i32 %new
+}
+
+define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_monotonic_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2];
+; SM90-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release monotonic
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2];
+; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2];
+; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release acquire
+ ret i32 %new
+}
+
+define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_acquire_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2];
+; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release acquire
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new release seq_cst
+ ret i32 %new
+}
+
+define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: release_seq_cst_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new release seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2];
+; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2];
+; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_monotonic_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2];
+; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel monotonic
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2];
+; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2];
+; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_acquire_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2];
+; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new acq_rel seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_monotonic_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_acquire_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst acquire
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_generic(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_global(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ ret i32 %new
+}
+
+define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i32_shared(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ ret i32 %new
+}
+
+define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2];
+; SM90-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2];
+; SM90-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_monotonic_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2];
+; SM90-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic monotonic
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_acquire_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic acquire
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: monotonic_seq_cst_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new monotonic seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_monotonic_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire monotonic
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_acquire_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire acquire
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acquire_seq_cst_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acquire seq_cst
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2];
+; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2];
+; SM90-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release monotonic
+ ret i64 %new
+}
+
+define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_monotonic_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2];
+; SM90-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release monotonic
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2];
+; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2];
+; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release acquire
+ ret i64 %new
+}
+
+define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_acquire_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2];
+; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release acquire
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release seq_cst
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new release seq_cst
+ ret i64 %new
+}
+
+define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: release_seq_cst_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new release seq_cst
+ ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2];
+; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel monotonic
+ ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2];
+; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ ret i64 %new
+}
+
+define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_monotonic_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2];
+; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel monotonic
+ ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2];
+; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+ ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2];
+; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel acquire
+ ret i64 %new
+}
+
+define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_acquire_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2];
+; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel acquire
+ ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ ret i64 %new
+}
+
+define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: acq_rel_seq_cst_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new acq_rel seq_cst
+ ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst monotonic
+ ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ ret i64 %new
+}
+
+define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_monotonic_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst monotonic
+ ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst acquire
+ ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst acquire
+ ret i64 %new
+}
+
+define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_acquire_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst acquire
+ ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_generic(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2];
+; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_global(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2];
+; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(1) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ ret i64 %new
+}
+
+define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %new) {
+; SM90-LABEL: seq_cst_seq_cst_i64_shared(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2];
+; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr addrspace(3) %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ ret i64 %new
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
index 33a1f15c6a5cd69..aaea0d2ee25efb3 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
@@ -7,6 +7,7 @@
; TODO: these are system scope, but are compiled to gpu scope..
; TODO: these are seq_cst, but are compiled to relaxed..
+
; CHECK-LABEL: relaxed_sys_i8
define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-LABEL: relaxed_sys_i8(
@@ -17,86 +18,1153 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM30-NEXT: .reg .b64 %rd<3>;
; SM30-EMPTY:
; SM30-NEXT: // %bb.0:
-; SM30-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
-; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM30-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
+; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM30-NEXT: and.b64 %rd1, %rd2, -4;
+; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT: and.b32 %r10, %r9, 3;
+; SM30-NEXT: shl.b32 %r1, %r10, 3;
+; SM30-NEXT: mov.b32 %r11, 255;
+; SM30-NEXT: shl.b32 %r12, %r11, %r1;
+; SM30-NEXT: not.b32 %r2, %r12;
+; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT: and.b32 %r14, %r13, 255;
+; SM30-NEXT: shl.b32 %r3, %r14, %r1;
+; SM30-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM30-NEXT: shl.b32 %r4, %r15, %r1;
+; SM30-NEXT: ld.u32 %r16, [%rd1];
+; SM30-NEXT: and.b32 %r20, %r16, %r2;
+; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
+; SM30-NEXT: // =>This Inner Loop Header: Depth=1
+; SM30-NEXT: or.b32 %r17, %r20, %r3;
+; SM30-NEXT: or.b32 %r18, %r20, %r4;
+; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT: @%p1 bra $L__BB0_3;
+; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT: // in Loop: Header=BB0_1 Depth=1
+; SM30-NEXT: and.b32 %r8, %r7, %r2;
+; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT: mov.u32 %r20, %r8;
+; SM30-NEXT: @%p2 bra $L__BB0_1;
+; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end
+; SM30-NEXT: st.param.b32 [func_retval0], %r13;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: relaxed_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB0_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB0_1;
+; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+; SM90-LABEL: relaxed_sys_i8(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB0_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB0_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB0_1;
+; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
+ ret i8 %new
+}
+
+define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
+; SM30-LABEL: acquire_sys_i8(
+; SM30: {
+; SM30-NEXT: .reg .pred %p<3>;
+; SM30-NEXT: .reg .b16 %rs<2>;
+; SM30-NEXT: .reg .b32 %r<21>;
+; SM30-NEXT: .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2];
+; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0];
+; SM30-NEXT: and.b64 %rd1, %rd2, -4;
+; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT: and.b32 %r10, %r9, 3;
+; SM30-NEXT: shl.b32 %r1, %r10, 3;
+; SM30-NEXT: mov.b32 %r11, 255;
+; SM30-NEXT: shl.b32 %r12, %r11, %r1;
+; SM30-NEXT: not.b32 %r2, %r12;
+; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT: and.b32 %r14, %r13, 255;
+; SM30-NEXT: shl.b32 %r3, %r14, %r1;
+; SM30-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1];
+; SM30-NEXT: shl.b32 %r4, %r15, %r1;
+; SM30-NEXT: ld.u32 %r16, [%rd1];
+; SM30-NEXT: and.b32 %r20, %r16, %r2;
+; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
+; SM30-NEXT: // =>This Inner Loop Header: Depth=1
+; SM30-NEXT: or.b32 %r17, %r20, %r3;
+; SM30-NEXT: or.b32 %r18, %r20, %r4;
+; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT: @%p1 bra $L__BB1_3;
+; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT: // in Loop: Header=BB1_1 Depth=1
+; SM30-NEXT: and.b32 %r8, %r7, %r2;
+; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT: mov.u32 %r20, %r8;
+; SM30-NEXT: @%p2 bra $L__BB1_1;
+; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: st.param.b32 [func_retval0], %r13;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: acquire_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB1_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB1_1;
+; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+; SM90-LABEL: acquire_sys_i8(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB1_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB1_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB1_1;
+; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acquire acquire
+ ret i8 %new
+}
+
+define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
+; SM30-LABEL: release_sys_i8(
+; SM30: {
+; SM30-NEXT: .reg .pred %p<3>;
+; SM30-NEXT: .reg .b16 %rs<2>;
+; SM30-NEXT: .reg .b32 %r<21>;
+; SM30-NEXT: .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2];
+; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0];
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: and.b64 %rd1, %rd2, -4;
+; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT: and.b32 %r10, %r9, 3;
+; SM30-NEXT: shl.b32 %r1, %r10, 3;
+; SM30-NEXT: mov.b32 %r11, 255;
+; SM30-NEXT: shl.b32 %r12, %r11, %r1;
+; SM30-NEXT: not.b32 %r2, %r12;
+; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT: and.b32 %r14, %r13, 255;
+; SM30-NEXT: shl.b32 %r3, %r14, %r1;
+; SM30-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1];
+; SM30-NEXT: shl.b32 %r4, %r15, %r1;
+; SM30-NEXT: ld.u32 %r16, [%rd1];
+; SM30-NEXT: and.b32 %r20, %r16, %r2;
+; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
+; SM30-NEXT: // =>This Inner Loop Header: Depth=1
+; SM30-NEXT: or.b32 %r17, %r20, %r3;
+; SM30-NEXT: or.b32 %r18, %r20, %r4;
+; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT: @%p1 bra $L__BB2_3;
+; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT: // in Loop: Header=BB2_1 Depth=1
+; SM30-NEXT: and.b32 %r8, %r7, %r2;
+; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT: mov.u32 %r20, %r8;
+; SM30-NEXT: @%p2 bra $L__BB2_1;
+; SM30-NEXT: $L__BB2_3: // %partword.cmpxchg.end
+; SM30-NEXT: st.param.b32 [func_retval0], %r13;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: release_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB2_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB2_1;
+; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+; SM90-LABEL: release_sys_i8(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB2_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB2_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB2_1;
+; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new release monotonic
+ ret i8 %new
+}
+
+define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
+; SM30-LABEL: acq_rel_sys_i8(
+; SM30: {
+; SM30-NEXT: .reg .pred %p<3>;
+; SM30-NEXT: .reg .b16 %rs<2>;
+; SM30-NEXT: .reg .b32 %r<21>;
+; SM30-NEXT: .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
+; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: and.b64 %rd1, %rd2, -4;
+; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT: and.b32 %r10, %r9, 3;
+; SM30-NEXT: shl.b32 %r1, %r10, 3;
+; SM30-NEXT: mov.b32 %r11, 255;
+; SM30-NEXT: shl.b32 %r12, %r11, %r1;
+; SM30-NEXT: not.b32 %r2, %r12;
+; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT: and.b32 %r14, %r13, 255;
+; SM30-NEXT: shl.b32 %r3, %r14, %r1;
+; SM30-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
+; SM30-NEXT: shl.b32 %r4, %r15, %r1;
+; SM30-NEXT: ld.u32 %r16, [%rd1];
+; SM30-NEXT: and.b32 %r20, %r16, %r2;
+; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
+; SM30-NEXT: // =>This Inner Loop Header: Depth=1
+; SM30-NEXT: or.b32 %r17, %r20, %r3;
+; SM30-NEXT: or.b32 %r18, %r20, %r4;
+; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT: @%p1 bra $L__BB3_3;
+; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT: // in Loop: Header=BB3_1 Depth=1
+; SM30-NEXT: and.b32 %r8, %r7, %r2;
+; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT: mov.u32 %r20, %r8;
+; SM30-NEXT: @%p2 bra $L__BB3_1;
+; SM30-NEXT: $L__BB3_3: // %partword.cmpxchg.end
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: st.param.b32 [func_retval0], %r13;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: acq_rel_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB3_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB3_1;
+; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+; SM90-LABEL: acq_rel_sys_i8(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB3_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB3_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB3_1;
+; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new acq_rel acquire
+ ret i8 %new
+}
+
+define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
+; SM30-LABEL: seq_cst_sys_i8(
+; SM30: {
+; SM30-NEXT: .reg .pred %p<3>;
+; SM30-NEXT: .reg .b16 %rs<2>;
+; SM30-NEXT: .reg .b32 %r<21>;
+; SM30-NEXT: .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
+; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: and.b64 %rd1, %rd2, -4;
+; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM30-NEXT: and.b32 %r10, %r9, 3;
+; SM30-NEXT: shl.b32 %r1, %r10, 3;
+; SM30-NEXT: mov.b32 %r11, 255;
+; SM30-NEXT: shl.b32 %r12, %r11, %r1;
+; SM30-NEXT: not.b32 %r2, %r12;
+; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM30-NEXT: and.b32 %r14, %r13, 255;
+; SM30-NEXT: shl.b32 %r3, %r14, %r1;
+; SM30-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
+; SM30-NEXT: shl.b32 %r4, %r15, %r1;
+; SM30-NEXT: ld.u32 %r16, [%rd1];
+; SM30-NEXT: and.b32 %r20, %r16, %r2;
+; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop
+; SM30-NEXT: // =>This Inner Loop Header: Depth=1
+; SM30-NEXT: or.b32 %r17, %r20, %r3;
+; SM30-NEXT: or.b32 %r18, %r20, %r4;
+; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM30-NEXT: @%p1 bra $L__BB4_3;
+; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT: // in Loop: Header=BB4_1 Depth=1
+; SM30-NEXT: and.b32 %r8, %r7, %r2;
+; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM30-NEXT: mov.u32 %r20, %r8;
+; SM30-NEXT: @%p2 bra $L__BB4_1;
+; SM30-NEXT: $L__BB4_3: // %partword.cmpxchg.end
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: st.param.b32 [func_retval0], %r13;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: seq_cst_sys_i8(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
+; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM70-NEXT: @%p1 bra $L__BB4_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
+; SM70-NEXT: @%p2 bra $L__BB4_1;
+; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+; SM90-LABEL: seq_cst_sys_i8(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<21>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM90-NEXT: and.b32 %r10, %r9, 3;
+; SM90-NEXT: shl.b32 %r1, %r10, 3;
+; SM90-NEXT: mov.b32 %r11, 255;
+; SM90-NEXT: shl.b32 %r12, %r11, %r1;
+; SM90-NEXT: not.b32 %r2, %r12;
+; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM90-NEXT: and.b32 %r14, %r13, 255;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
+; SM90-NEXT: shl.b32 %r4, %r15, %r1;
+; SM90-NEXT: ld.u32 %r16, [%rd1];
+; SM90-NEXT: and.b32 %r20, %r16, %r2;
+; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r17, %r20, %r3;
+; SM90-NEXT: or.b32 %r18, %r20, %r4;
+; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r18;
+; SM90-NEXT: @%p1 bra $L__BB4_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB4_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM90-NEXT: mov.u32 %r20, %r8;
+; SM90-NEXT: @%p2 bra $L__BB4_1;
+; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ ret i8 %new
+}
+
+; CHECK-LABEL: relaxed_sys_i16
+define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
+; SM30-LABEL: relaxed_sys_i16(
+; SM30: {
+; SM30-NEXT: .reg .pred %p<3>;
+; SM30-NEXT: .reg .b16 %rs<2>;
+; SM30-NEXT: .reg .b32 %r<20>;
+; SM30-NEXT: .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
+; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM30-NEXT: and.b64 %rd1, %rd2, -4;
+; SM30-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT: and.b32 %r11, %r10, 3;
+; SM30-NEXT: shl.b32 %r1, %r11, 3;
+; SM30-NEXT: mov.b32 %r12, 65535;
+; SM30-NEXT: shl.b32 %r13, %r12, %r1;
+; SM30-NEXT: not.b32 %r2, %r13;
+; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT: shl.b32 %r3, %r14, %r1;
+; SM30-NEXT: shl.b32 %r4, %r9, %r1;
+; SM30-NEXT: ld.u32 %r15, [%rd1];
+; SM30-NEXT: and.b32 %r19, %r15, %r2;
+; SM30-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
+; SM30-NEXT: // =>This Inner Loop Header: Depth=1
+; SM30-NEXT: or.b32 %r16, %r19, %r3;
+; SM30-NEXT: or.b32 %r17, %r19, %r4;
+; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM30-NEXT: @%p1 bra $L__BB5_3;
+; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT: // in Loop: Header=BB5_1 Depth=1
+; SM30-NEXT: and.b32 %r8, %r7, %r2;
+; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM30-NEXT: mov.u32 %r19, %r8;
+; SM30-NEXT: @%p2 bra $L__BB5_1;
+; SM30-NEXT: $L__BB5_3: // %partword.cmpxchg.end
+; SM30-NEXT: st.param.b32 [func_retval0], %r14;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: relaxed_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB5_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB5_1;
+; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+; SM90-LABEL: relaxed_sys_i16(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB5_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB5_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB5_1;
+; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
+ ret i16 %new
+}
+
+define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
+; SM30-LABEL: acquire_sys_i16(
+; SM30: {
+; SM30-NEXT: .reg .pred %p<3>;
+; SM30-NEXT: .reg .b16 %rs<2>;
+; SM30-NEXT: .reg .b32 %r<20>;
+; SM30-NEXT: .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2];
+; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0];
+; SM30-NEXT: and.b64 %rd1, %rd2, -4;
+; SM30-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1];
+; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT: and.b32 %r11, %r10, 3;
+; SM30-NEXT: shl.b32 %r1, %r11, 3;
+; SM30-NEXT: mov.b32 %r12, 65535;
+; SM30-NEXT: shl.b32 %r13, %r12, %r1;
+; SM30-NEXT: not.b32 %r2, %r13;
+; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT: shl.b32 %r3, %r14, %r1;
+; SM30-NEXT: shl.b32 %r4, %r9, %r1;
+; SM30-NEXT: ld.u32 %r15, [%rd1];
+; SM30-NEXT: and.b32 %r19, %r15, %r2;
+; SM30-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
+; SM30-NEXT: // =>This Inner Loop Header: Depth=1
+; SM30-NEXT: or.b32 %r16, %r19, %r3;
+; SM30-NEXT: or.b32 %r17, %r19, %r4;
+; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM30-NEXT: @%p1 bra $L__BB6_3;
+; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT: // in Loop: Header=BB6_1 Depth=1
+; SM30-NEXT: and.b32 %r8, %r7, %r2;
+; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM30-NEXT: mov.u32 %r19, %r8;
+; SM30-NEXT: @%p2 bra $L__BB6_1;
+; SM30-NEXT: $L__BB6_3: // %partword.cmpxchg.end
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: st.param.b32 [func_retval0], %r14;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: acquire_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB6_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB6_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB6_1;
+; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+; SM90-LABEL: acquire_sys_i16(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1];
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB6_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB6_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB6_1;
+; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acquire acquire
+ ret i16 %new
+}
+
+define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
+; SM30-LABEL: release_sys_i16(
+; SM30: {
+; SM30-NEXT: .reg .pred %p<3>;
+; SM30-NEXT: .reg .b16 %rs<2>;
+; SM30-NEXT: .reg .b32 %r<20>;
+; SM30-NEXT: .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2];
+; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0];
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1];
+; SM30-NEXT: and.b64 %rd1, %rd2, -4;
+; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT: and.b32 %r11, %r10, 3;
+; SM30-NEXT: shl.b32 %r1, %r11, 3;
+; SM30-NEXT: mov.b32 %r12, 65535;
+; SM30-NEXT: shl.b32 %r13, %r12, %r1;
+; SM30-NEXT: not.b32 %r2, %r13;
+; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM30-NEXT: shl.b32 %r3, %r14, %r1;
+; SM30-NEXT: shl.b32 %r4, %r9, %r1;
+; SM30-NEXT: ld.u32 %r15, [%rd1];
+; SM30-NEXT: and.b32 %r19, %r15, %r2;
+; SM30-NEXT: $L__BB7_1: // %partword.cmpxchg.loop
+; SM30-NEXT: // =>This Inner Loop Header: Depth=1
+; SM30-NEXT: or.b32 %r16, %r19, %r3;
+; SM30-NEXT: or.b32 %r17, %r19, %r4;
+; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM30-NEXT: @%p1 bra $L__BB7_3;
+; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM30-NEXT: // in Loop: Header=BB7_1 Depth=1
+; SM30-NEXT: and.b32 %r8, %r7, %r2;
+; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM30-NEXT: mov.u32 %r19, %r8;
+; SM30-NEXT: @%p2 bra $L__BB7_1;
+; SM30-NEXT: $L__BB7_3: // %partword.cmpxchg.end
+; SM30-NEXT: st.param.b32 [func_retval0], %r14;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: release_sys_i16(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB7_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB7_1;
+; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+; SM90-LABEL: release_sys_i16(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB7_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB7_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB7_1;
+; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new release monotonic
+ ret i16 %new
+}
+
+define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
+; SM30-LABEL: acq_rel_sys_i16(
+; SM30: {
+; SM30-NEXT: .reg .pred %p<3>;
+; SM30-NEXT: .reg .b16 %rs<2>;
+; SM30-NEXT: .reg .b32 %r<20>;
+; SM30-NEXT: .reg .b64 %rd<3>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
+; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
-; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
-; SM30-NEXT: and.b32 %r10, %r9, 3;
-; SM30-NEXT: shl.b32 %r1, %r10, 3;
-; SM30-NEXT: mov.b32 %r11, 255;
-; SM30-NEXT: shl.b32 %r12, %r11, %r1;
-; SM30-NEXT: not.b32 %r2, %r12;
-; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
-; SM30-NEXT: and.b32 %r14, %r13, 255;
+; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM30-NEXT: and.b32 %r11, %r10, 3;
+; SM30-NEXT: shl.b32 %r1, %r11, 3;
+; SM30-NEXT: mov.b32 %r12, 65535;
+; SM30-NEXT: shl.b32 %r13, %r12, %r1;
+; SM30-NEXT: not.b32 %r2, %r13;
+; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
-; SM30-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1];
-; SM30-NEXT: shl.b32 %r4, %r15, %r1;
-; SM30-NEXT: ld.u32 %r16, [%rd1];
-; SM30-NEXT: and.b32 %r20, %r16, %r2;
-; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
+; SM30-NEXT: shl.b32 %r4, %r9, %r1;
+; SM30-NEXT: ld.u32 %r15, [%rd1];
+; SM30-NEXT: and.b32 %r19, %r15, %r2;
+; SM30-NEXT: $L__BB8_1: // %partword.cmpxchg.loop
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
-; SM30-NEXT: or.b32 %r17, %r20, %r3;
-; SM30-NEXT: or.b32 %r18, %r20, %r4;
-; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM30-NEXT: setp.eq.s32 %p1, %r7, %r18;
-; SM30-NEXT: @%p1 bra $L__BB0_3;
+; SM30-NEXT: or.b32 %r16, %r19, %r3;
+; SM30-NEXT: or.b32 %r17, %r19, %r4;
+; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM30-NEXT: @%p1 bra $L__BB8_3;
; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM30-NEXT: // in Loop: Header=BB0_1 Depth=1
+; SM30-NEXT: // in Loop: Header=BB8_1 Depth=1
; SM30-NEXT: and.b32 %r8, %r7, %r2;
-; SM30-NEXT: setp.ne.s32 %p2, %r20, %r8;
-; SM30-NEXT: mov.u32 %r20, %r8;
-; SM30-NEXT: @%p2 bra $L__BB0_1;
-; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end
-; SM30-NEXT: st.param.b32 [func_retval0], %r13;
+; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM30-NEXT: mov.u32 %r19, %r8;
+; SM30-NEXT: @%p2 bra $L__BB8_1;
+; SM30-NEXT: $L__BB8_3: // %partword.cmpxchg.end
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: st.param.b32 [func_retval0], %r14;
; SM30-NEXT: ret;
;
-; SM70-LABEL: relaxed_sys_i8(
+; SM70-LABEL: acq_rel_sys_i16(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
-; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b32 %r<20>;
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
+; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
-; SM70-NEXT: and.b32 %r10, %r9, 3;
-; SM70-NEXT: shl.b32 %r1, %r10, 3;
-; SM70-NEXT: mov.b32 %r11, 255;
-; SM70-NEXT: shl.b32 %r12, %r11, %r1;
-; SM70-NEXT: not.b32 %r2, %r12;
-; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
-; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
-; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1];
-; SM70-NEXT: shl.b32 %r4, %r15, %r1;
-; SM70-NEXT: ld.u32 %r16, [%rd1];
-; SM70-NEXT: and.b32 %r20, %r16, %r2;
-; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b32 %r17, %r20, %r3;
-; SM70-NEXT: or.b32 %r18, %r20, %r4;
-; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
-; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
-; SM70-NEXT: @%p1 bra $L__BB0_3;
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB8_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1
+; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
-; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
-; SM70-NEXT: mov.u32 %r20, %r8;
-; SM70-NEXT: @%p2 bra $L__BB0_1;
-; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end
-; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB8_1;
+; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
- ret i8 %new
+; SM90-LABEL: acq_rel_sys_i16(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB8_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB8_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB8_1;
+; SM90-NEXT: $L__BB8_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new acq_rel acquire
+ ret i16 %new
}
-; CHECK-LABEL: relaxed_sys_i16
-define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
-; SM30-LABEL: relaxed_sys_i16(
+; CHECK-LABEL: seq_cst_sys_i16
+define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
+; SM30-LABEL: seq_cst_sys_i16(
; SM30: {
; SM30-NEXT: .reg .pred %p<3>;
; SM30-NEXT: .reg .b16 %rs<2>;
@@ -104,10 +1172,11 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
; SM30-NEXT: .reg .b64 %rd<3>;
; SM30-EMPTY:
; SM30-NEXT: // %bb.0:
-; SM30-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
-; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM30-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
+; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
-; SM30-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1];
; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
; SM30-NEXT: and.b32 %r11, %r10, 3;
; SM30-NEXT: shl.b32 %r1, %r11, 3;
@@ -119,24 +1188,25 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
; SM30-NEXT: shl.b32 %r4, %r9, %r1;
; SM30-NEXT: ld.u32 %r15, [%rd1];
; SM30-NEXT: and.b32 %r19, %r15, %r2;
-; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
+; SM30-NEXT: $L__BB9_1: // %partword.cmpxchg.loop
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
; SM30-NEXT: or.b32 %r16, %r19, %r3;
; SM30-NEXT: or.b32 %r17, %r19, %r4;
; SM30-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
; SM30-NEXT: setp.eq.s32 %p1, %r7, %r17;
-; SM30-NEXT: @%p1 bra $L__BB1_3;
+; SM30-NEXT: @%p1 bra $L__BB9_3;
; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM30-NEXT: // in Loop: Header=BB1_1 Depth=1
+; SM30-NEXT: // in Loop: Header=BB9_1 Depth=1
; SM30-NEXT: and.b32 %r8, %r7, %r2;
; SM30-NEXT: setp.ne.s32 %p2, %r19, %r8;
; SM30-NEXT: mov.u32 %r19, %r8;
-; SM30-NEXT: @%p2 bra $L__BB1_1;
-; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end
+; SM30-NEXT: @%p2 bra $L__BB9_1;
+; SM30-NEXT: $L__BB9_3: // %partword.cmpxchg.end
+; SM30-NEXT: membar.sys;
; SM30-NEXT: st.param.b32 [func_retval0], %r14;
; SM30-NEXT: ret;
;
-; SM70-LABEL: relaxed_sys_i16(
+; SM70-LABEL: seq_cst_sys_i16(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
; SM70-NEXT: .reg .b16 %rs<2>;
@@ -144,10 +1214,11 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
-; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
-; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1];
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
; SM70-NEXT: and.b32 %r11, %r10, 3;
; SM70-NEXT: shl.b32 %r1, %r11, 3;
@@ -159,23 +1230,65 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
; SM70-NEXT: ld.u32 %r15, [%rd1];
; SM70-NEXT: and.b32 %r19, %r15, %r2;
-; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
+; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
; SM70-NEXT: or.b32 %r16, %r19, %r3;
; SM70-NEXT: or.b32 %r17, %r19, %r4;
-; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: atom.relaxed.cas.b32 %r7, [%rd1], %r17, %r16;
; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
-; SM70-NEXT: @%p1 bra $L__BB1_3;
+; SM70-NEXT: @%p1 bra $L__BB9_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
-; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1
+; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1
; SM70-NEXT: and.b32 %r8, %r7, %r2;
; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
; SM70-NEXT: mov.u32 %r19, %r8;
-; SM70-NEXT: @%p2 bra $L__BB1_1;
-; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end
+; SM70-NEXT: @%p2 bra $L__BB9_1;
+; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end
+; SM70-NEXT: fence.acq_rel.sys;
; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
+; SM90-LABEL: seq_cst_sys_i16(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<2>;
+; SM90-NEXT: .reg .b32 %r<20>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM90-NEXT: and.b32 %r11, %r10, 3;
+; SM90-NEXT: shl.b32 %r1, %r11, 3;
+; SM90-NEXT: mov.b32 %r12, 65535;
+; SM90-NEXT: shl.b32 %r13, %r12, %r1;
+; SM90-NEXT: not.b32 %r2, %r13;
+; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM90-NEXT: shl.b32 %r3, %r14, %r1;
+; SM90-NEXT: shl.b32 %r4, %r9, %r1;
+; SM90-NEXT: ld.u32 %r15, [%rd1];
+; SM90-NEXT: and.b32 %r19, %r15, %r2;
+; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: or.b32 %r16, %r19, %r3;
+; SM90-NEXT: or.b32 %r17, %r19, %r4;
+; SM90-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM90-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM90-NEXT: @%p1 bra $L__BB9_3;
+; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM90-NEXT: // in Loop: Header=BB9_1 Depth=1
+; SM90-NEXT: and.b32 %r8, %r7, %r2;
+; SM90-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM90-NEXT: mov.u32 %r19, %r8;
+; SM90-NEXT: @%p2 bra $L__BB9_1;
+; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
ret i16 %new
}
@@ -203,13 +1316,197 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
; SM70-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1];
; SM70-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2];
-; SM70-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
+; SM90-LABEL: relaxed_sys_i32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2];
+; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
%pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
ret i32 %new
}
+define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
+; SM30-LABEL: acq_rel_sys_i32(
+; SM30: {
+; SM30-NEXT: .reg .b32 %r<4>;
+; SM30-NEXT: .reg .b64 %rd<2>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
+; SM30-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
+; SM30-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
+; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM30-NEXT: st.param.b32 [func_retval0], %r2;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: acq_rel_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
+; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+; SM90-LABEL: acq_rel_sys_i32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
+; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acq_rel acquire
+ ret i32 %new
+}
+
+define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
+; SM30-LABEL: acquire_sys_i32(
+; SM30: {
+; SM30-NEXT: .reg .b32 %r<4>;
+; SM30-NEXT: .reg .b64 %rd<2>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0];
+; SM30-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1];
+; SM30-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2];
+; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM30-NEXT: st.param.b32 [func_retval0], %r2;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: acquire_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+; SM90-LABEL: acquire_sys_i32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2];
+; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new acquire acquire
+ ret i32 %new
+}
+
+define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
+; SM30-LABEL: release_sys_i32(
+; SM30: {
+; SM30-NEXT: .reg .b32 %r<4>;
+; SM30-NEXT: .reg .b64 %rd<2>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0];
+; SM30-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1];
+; SM30-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2];
+; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM30-NEXT: st.param.b32 [func_retval0], %r2;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: release_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0];
+; SM70-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2];
+; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+; SM90-LABEL: release_sys_i32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0];
+; SM90-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2];
+; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new release monotonic
+ ret i32 %new
+}
+
+define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
+; SM30-LABEL: seq_cst_sys_i32(
+; SM30: {
+; SM30-NEXT: .reg .b32 %r<4>;
+; SM30-NEXT: .reg .b64 %rd<2>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
+; SM30-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
+; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM30-NEXT: st.param.b32 [func_retval0], %r2;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: seq_cst_sys_i32(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
+; SM70-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
+; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+; SM90-LABEL: seq_cst_sys_i32(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
+; SM90-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
+; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ ret i32 %new
+}
+
; CHECK-LABEL: relaxed_sys_i64
define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
; SM30-LABEL: relaxed_sys_i64(
@@ -232,11 +1529,183 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
; SM70-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
-; SM70-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
+; SM90-LABEL: relaxed_sys_i64(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
+; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
%pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
ret i64 %new
}
+
+define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
+; SM30-LABEL: acquire_sys_i64(
+; SM30: {
+; SM30-NEXT: .reg .b64 %rd<5>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0];
+; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1];
+; SM30-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2];
+; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM30-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: acquire_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+; SM90-LABEL: acquire_sys_i64(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2];
+; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acquire acquire
+ ret i64 %new
+}
+
+define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
+; SM30-LABEL: acq_rel_sys_i64(
+; SM30: {
+; SM30-NEXT: .reg .b64 %rd<5>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
+; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
+; SM30-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
+; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM30-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: acq_rel_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
+; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+; SM90-LABEL: acq_rel_sys_i64(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
+; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new acq_rel acquire
+ ret i64 %new
+}
+
+define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
+; SM30-LABEL: release_sys_i64(
+; SM30: {
+; SM30-NEXT: .reg .b64 %rd<5>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0];
+; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1];
+; SM30-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2];
+; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM30-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: release_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0];
+; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2];
+; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+; SM90-LABEL: release_sys_i64(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0];
+; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2];
+; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new release monotonic
+ ret i64 %new
+}
+
+define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
+; SM30-LABEL: seq_cst_sys_i64(
+; SM30: {
+; SM30-NEXT: .reg .b64 %rd<5>;
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
+; SM30-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
+; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM30-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM30-NEXT: ret;
+;
+; SM70-LABEL: seq_cst_sys_i64(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
+; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
+; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+; SM90-LABEL: seq_cst_sys_i64(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
+; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
+; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ ret i64 %new
+}
+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py
new file mode 100644
index 000000000000000..bf787406b76c01f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py
@@ -0,0 +1,46 @@
+# For manual usage, not as a part of lit tests. Used for generating the following tests:
+# cmpxchg-sm60.ll, cmpxchg-sm70.ll, cmpxchg-sm90.ll
+
+from string import Template
+from itertools import product
+
+cmpxchg_func = Template(
+"""define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
+ %pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure
+ ret i$size %new
+}
+"""
+)
+
+run_statement = Template(
+ """; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify -arch=sm_${sm} %}
+"""
+)
+
+TESTS = [(60, 50), (70, 63), (90, 87)]
+
+LLVM_SCOPES = ["", "block", "cluster", "device"]
+
+SCOPE_LLVM_TO_PTX = {"": "sys", "block": "cta", "cluster": "cluster", "device": "gpu"}
+
+SUCCESS_ORDERINGS = ["monotonic", "acquire", "release", "acq_rel", "seq_cst"]
+
+FAILURE_ORDERINGS = ["monotonic", "acquire", "seq_cst"]
+
+SIZES = [8, 16, 32, 64]
+
+ADDRSPACES = [0, 1, 3]
+
+ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"}
+
+if __name__ == "__main__":
+ for sm, ptx in TESTS:
+ with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp:
+ print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
+ for size, success, failure, addrspace in product(SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES):
+ if addrspace == 0:
+ addrspace_cast = ""
+ else:
+ addrspace_cast = " addrspace({})".format(str(addrspace))
+ print(cmpxchg_func.substitute(success=success, failure=failure, size=size, addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], addrspace_cast=addrspace_cast), file=fp)
diff --git a/llvm/test/CodeGen/NVPTX/lit.local.cfg b/llvm/test/CodeGen/NVPTX/lit.local.cfg
index 54a6c338bdf85af..84cce669ec10b48 100644
--- a/llvm/test/CodeGen/NVPTX/lit.local.cfg
+++ b/llvm/test/CodeGen/NVPTX/lit.local.cfg
@@ -1,4 +1,4 @@
if not "NVPTX" in config.root.targets:
config.unsupported = True
config.suffixes.add(".py")
-config.excludes = ["fence.py"]
+config.excludes = ["fence.py", "cmpxchg.py"]
>From 87d17e5da7c516ae43d9396fabad0c266d3b1903 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Fri, 7 Feb 2025 00:02:54 +0000
Subject: [PATCH 2/5] clang-format, black
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 19 +++++++++++--------
llvm/test/CodeGen/NVPTX/cmpxchg.py | 17 ++++++++++++++---
2 files changed, 25 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a74eada0afcebab..db5f45fb7d09c85 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5603,9 +5603,9 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
if (isa<AtomicCmpXchgInst>(Inst)) {
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
if (isReleaseOrStronger(Ord))
- return Ord == AtomicOrdering::SequentiallyConsistent ?
- Builder.CreateFence(AtomicOrdering::SequentiallyConsistent) :
- Builder.CreateFence(AtomicOrdering::Release);
+ return Ord == AtomicOrdering::SequentiallyConsistent
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
+ : Builder.CreateFence(AtomicOrdering::Release);
} else {
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
}
@@ -5617,13 +5617,16 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
AtomicOrdering Ord) const {
// Specialize for cmpxchg
if (isa<AtomicCmpXchgInst>(Inst)) {
- auto CASWidth= cast<IntegerType>(dyn_cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType())->getBitWidth();
+ auto CASWidth =
+ cast<IntegerType>(
+ dyn_cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType())
+ ->getBitWidth();
// Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
if (isAcquireOrStronger(Ord))
- return (Ord == AtomicOrdering::SequentiallyConsistent
- && CASWidth >= STI.getMinCmpXchgSizeInBits()) ?
- nullptr :
- Builder.CreateFence(AtomicOrdering::Acquire);
+ return (Ord == AtomicOrdering::SequentiallyConsistent &&
+ CASWidth >= STI.getMinCmpXchgSizeInBits())
+ ? nullptr
+ : Builder.CreateFence(AtomicOrdering::Acquire);
} else {
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
}
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.py b/llvm/test/CodeGen/NVPTX/cmpxchg.py
index bf787406b76c01f..ae7450015ecd2eb 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.py
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.py
@@ -5,7 +5,7 @@
from itertools import product
cmpxchg_func = Template(
-"""define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
+ """define i$size @${success}_${failure}_i${size}_${addrspace}(ptr${addrspace_cast} %addr, i$size %cmp, i$size %new) {
%pairold = cmpxchg ptr${addrspace_cast} %addr, i$size %cmp, i$size %new $success $failure
ret i$size %new
}
@@ -38,9 +38,20 @@
for sm, ptx in TESTS:
with open("cmpxchg-sm{}.ll".format(str(sm)), "w") as fp:
print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
- for size, success, failure, addrspace in product(SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES):
+ for size, success, failure, addrspace in product(
+ SIZES, SUCCESS_ORDERINGS, FAILURE_ORDERINGS, ADDRSPACES
+ ):
if addrspace == 0:
addrspace_cast = ""
else:
addrspace_cast = " addrspace({})".format(str(addrspace))
- print(cmpxchg_func.substitute(success=success, failure=failure, size=size, addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace], addrspace_cast=addrspace_cast), file=fp)
+ print(
+ cmpxchg_func.substitute(
+ success=success,
+ failure=failure,
+ size=size,
+ addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
+ addrspace_cast=addrspace_cast,
+ ),
+ file=fp,
+ )
>From e6949e95fca01fce6726778e32aa49b8afcd8629 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Fri, 7 Feb 2025 01:37:58 +0000
Subject: [PATCH 3/5] fix tests: SM < 70 does not support memory order, emit
explicit atom.relaxed.cas for monotonic
---
llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 8 ++++----
llvm/test/CodeGen/NVPTX/atomics.ll | 6 +++---
2 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index 67abfe8295a6231..9027bd6a1478053 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -71,7 +71,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
-; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
+; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r6, [%r1], %r54, %r32;
; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
@@ -87,7 +87,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
+; CHECKPTX71-NEXT: atom.relaxed.cas.b32 %r9, [%r1], %r55, %r37;
; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
@@ -109,7 +109,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
+; CHECKPTX71-NEXT: atom.relaxed.global.cas.b32 %r15, [%r10], %r56, %r45;
; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
@@ -131,7 +131,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
-; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
+; CHECKPTX71-NEXT: atom.relaxed.shared.cas.b32 %r21, [%r16], %r57, %r53;
; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index ac2bd8828b20579..fccc3a8844ffc4f 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -177,7 +177,7 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
; CHECK-LABEL: atomicrmw_add_f16_generic
define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
-; CHECK: atom.acquire.cas
+; CHECK: atom.cas
%ret = atomicrmw fadd ptr %addr, half %val seq_cst
ret half %ret
}
@@ -198,14 +198,14 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
; CHECK-LABEL: atomic_cmpxchg_i32
define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) {
-; CHECK: atom.acquire.cas.b32
+; CHECK: atom.cas.b32
%pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
ret i32 %new
}
; CHECK-LABEL: atomic_cmpxchg_i64
define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) {
-; CHECK: atom.acquire.cas.b64
+; CHECK: atom.cas.b64
%pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
ret i64 %new
}
>From c2735020e08ee51ce5fcf1dce12c4f3760370127 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Fri, 7 Feb 2025 01:40:49 +0000
Subject: [PATCH 4/5] expect a fence for seq_cst ordering
---
llvm/test/CodeGen/NVPTX/atomics.ll | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index fccc3a8844ffc4f..b7b459c44bef549 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -177,6 +177,7 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
; CHECK-LABEL: atomicrmw_add_f16_generic
define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
+; CHECK: membar.sys
; CHECK: atom.cas
%ret = atomicrmw fadd ptr %addr, half %val seq_cst
ret half %ret
@@ -198,6 +199,7 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
; CHECK-LABEL: atomic_cmpxchg_i32
define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) {
+; CHECK: membar.sys
; CHECK: atom.cas.b32
%pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
ret i32 %new
@@ -205,6 +207,7 @@ define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) {
; CHECK-LABEL: atomic_cmpxchg_i64
define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) {
+; CHECK: membar.sys
; CHECK: atom.cas.b64
%pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
ret i64 %new
>From 9a36c96cb5271c9724bd502b911ef3c77b694c0e Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Sat, 8 Feb 2025 00:55:40 +0000
Subject: [PATCH 5/5] address review comments
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 43 ++-
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 29 +-
llvm/test/CodeGen/NVPTX/atomics.ll | 362 ++++++++++++++++++--
3 files changed, 360 insertions(+), 74 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index db5f45fb7d09c85..f2b4bc3a4b5613f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5599,16 +5599,16 @@ AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
- // Specialize for cmpxchg
- if (isa<AtomicCmpXchgInst>(Inst)) {
- // Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
- if (isReleaseOrStronger(Ord))
- return Ord == AtomicOrdering::SequentiallyConsistent
- ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
- : Builder.CreateFence(AtomicOrdering::Release);
- } else {
+ if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
- }
+
+ // Specialize for cmpxchg
+ // Emit a leading fence.sc for seq_cst cmpxchg operations that are not emulated
+ if (isReleaseOrStronger(Ord))
+ return Ord == AtomicOrdering::SequentiallyConsistent
+ ? Builder.CreateFence(AtomicOrdering::SequentiallyConsistent)
+ : Builder.CreateFence(AtomicOrdering::Release);
+
return nullptr;
}
@@ -5616,20 +5616,19 @@ Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
// Specialize for cmpxchg
- if (isa<AtomicCmpXchgInst>(Inst)) {
- auto CASWidth =
- cast<IntegerType>(
- dyn_cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType())
- ->getBitWidth();
- // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
- if (isAcquireOrStronger(Ord))
- return (Ord == AtomicOrdering::SequentiallyConsistent &&
- CASWidth >= STI.getMinCmpXchgSizeInBits())
- ? nullptr
- : Builder.CreateFence(AtomicOrdering::Acquire);
- } else {
+ if (!isa<AtomicCmpXchgInst>(Inst))
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
- }
+
+ auto CASWidth =
+ cast<IntegerType>(
+ dyn_cast<AtomicCmpXchgInst>(Inst)->getCompareOperand()->getType())
+ ->getBitWidth();
+ // Do not emit a trailing fence for seq_cst cmpxchg operations that are not emulated
+ if (isAcquireOrStronger(Ord) &&
+ (Ord != AtomicOrdering::SequentiallyConsistent ||
+ CASWidth < STI.getMinCmpXchgSizeInBits()))
+ return Builder.CreateFence(AtomicOrdering::Acquire);
+
return nullptr;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 14744411bc74bcd..1e88753e356e53d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2402,36 +2402,23 @@ foreach size = ["i16", "i32", "i64"] in {
}
}
-multiclass INT_PTX_ATOM_CAS_AS<string atomic_cmp_swap_without_as, string type, string order> {
+multiclass INT_PTX_ATOM_CAS_AS<string atomic_cmp_swap_without_as, string type, string order, list<Predicate> preds> {
// eg. with type = 32, order = .acquire, atomic_cmp_swap_without_as = atomic_cmp_swap_i32_acquire, and _GEN becomes
// F_ATOMIC_3<i32, Int32Regs, ".acquire", ".b32", ".cas", atomic_cmp_swap_i32_acquire, i32imm>
// For SM70, PTX63 +, memory orders are supported
defm _GEN: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, "", ".b"#type, ".cas",
- !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm"), preds>;
defm _GEN_USE_G: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas",
- !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm"), preds>;
defm _G: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".global", ".b"#type, ".cas",
- !cast<PatFrag>(atomic_cmp_swap_without_as#_g), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_g), !cast<Operand>("i"#type#"imm"), preds>;
defm _S: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), order, ".shared", ".b"#type, ".cas",
- !cast<PatFrag>(atomic_cmp_swap_without_as#_s), !cast<Operand>("i"#type#"imm"), [hasSM<70>, hasPTX<63>]>;
-
- // For older archs, we fall back to lowering to relaxed atom.cas
- defm _GEN_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", "", ".b"#type, ".cas",
- !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm")>;
-
- defm _GEN_USE_G_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas",
- !cast<PatFrag>(atomic_cmp_swap_without_as#_gen), !cast<Operand>("i"#type#"imm")>;
-
- defm _G_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".global", ".b"#type, ".cas",
- !cast<PatFrag>(atomic_cmp_swap_without_as#_g), !cast<Operand>("i"#type#"imm")>;
-
- defm _S_OLD: F_ATOMIC_3<!cast<ValueType>("i"#type), !cast<NVPTXRegClass>("Int"#type#"Regs"), "", ".shared", ".b"#type, ".cas",
- !cast<PatFrag>(atomic_cmp_swap_without_as#_s), !cast<Operand>("i"#type#"imm")>;
+ !cast<PatFrag>(atomic_cmp_swap_without_as#_s), !cast<Operand>("i"#type#"imm"), preds>;
}
// Define atom.cas for all combinations of size x memory order supported in PTX *and* on the hardware.
@@ -2442,9 +2429,11 @@ foreach size = ["32", "64"] in {
// Note that AtomicExpand will convert cmpxchg seq_cst to a cmpxchg monotonic with fences around it.
foreach order = ["acquire", "release", "acq_rel", ""] in {
if !eq(order, "") then {
- defm INT_PTX_ATOM_CAS_#size : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, ".relaxed">;
+ defm INT_PTX_ATOM_CAS_#size : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, ".relaxed", [hasSM<70>, hasPTX<63>]>;
+ defm INT_PTX_ATOM_CAS_#size#_old : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size, size, "", []>;
} else {
- defm INT_PTX_ATOM_CAS_#size#_#order : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "."#order>;
+ defm INT_PTX_ATOM_CAS_#size#_#order : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "."#order, [hasSM<70>, hasPTX<63>]>;
+ defm INT_PTX_ATOM_CAS_#size#_#order#_old : INT_PTX_ATOM_CAS_AS<"atomic_cmp_swap_i"#size#_#order, size, "", []>;
}
}
}
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index b7b459c44bef549..2f58d279f82c34d 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -1,47 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_32 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_32 | %ptxas-verify %}
; CHECK-LABEL: atom0
define i32 @atom0(ptr %addr, i32 %val) {
-; CHECK: atom.add.u32
+; CHECK-LABEL: atom0(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom0_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [atom0_param_1];
+; CHECK-NEXT: atom.add.u32 %r2, [%rd1], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%ret = atomicrmw add ptr %addr, i32 %val seq_cst
ret i32 %ret
}
; CHECK-LABEL: atom1
define i64 @atom1(ptr %addr, i64 %val) {
-; CHECK: atom.add.u64
+; CHECK-LABEL: atom1(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom1_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [atom1_param_1];
+; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%ret = atomicrmw add ptr %addr, i64 %val seq_cst
ret i64 %ret
}
; CHECK-LABEL: atom2
define i32 @atom2(ptr %subr, i32 %val) {
-; CHECK: neg.s32
-; CHECK: atom.add.u32
+; CHECK-LABEL: atom2(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom2_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [atom2_param_1];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .s32 temp;
+; CHECK-NEXT: neg.s32 temp, %r1;
+; CHECK-NEXT: atom.add.u32 %r2, [%rd1], temp;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%ret = atomicrmw sub ptr %subr, i32 %val seq_cst
ret i32 %ret
}
; CHECK-LABEL: atom3
define i64 @atom3(ptr %subr, i64 %val) {
-; CHECK: neg.s64
-; CHECK: atom.add.u64
+; CHECK-LABEL: atom3(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom3_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [atom3_param_1];
+; CHECK-NEXT: {
+; CHECK-NEXT: .reg .s64 temp;
+; CHECK-NEXT: neg.s64 temp, %rd2;
+; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], temp;
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%ret = atomicrmw sub ptr %subr, i64 %val seq_cst
ret i64 %ret
}
; CHECK-LABEL: atom4
define i32 @atom4(ptr %subr, i32 %val) {
-; CHECK: atom.and.b32
+; CHECK-LABEL: atom4(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom4_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [atom4_param_1];
+; CHECK-NEXT: atom.and.b32 %r2, [%rd1], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%ret = atomicrmw and ptr %subr, i32 %val seq_cst
ret i32 %ret
}
; CHECK-LABEL: atom5
define i64 @atom5(ptr %subr, i64 %val) {
-; CHECK: atom.and.b64
+; CHECK-LABEL: atom5(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom5_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [atom5_param_1];
+; CHECK-NEXT: atom.and.b64 %rd3, [%rd1], %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%ret = atomicrmw and ptr %subr, i64 %val seq_cst
ret i64 %ret
}
@@ -59,84 +123,198 @@ define i64 @atom5(ptr %subr, i64 %val) {
; CHECK-LABEL: atom8
define i32 @atom8(ptr %subr, i32 %val) {
-; CHECK: atom.or.b32
+; CHECK-LABEL: atom8(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom8_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [atom8_param_1];
+; CHECK-NEXT: atom.or.b32 %r2, [%rd1], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%ret = atomicrmw or ptr %subr, i32 %val seq_cst
ret i32 %ret
}
; CHECK-LABEL: atom9
define i64 @atom9(ptr %subr, i64 %val) {
-; CHECK: atom.or.b64
+; CHECK-LABEL: atom9(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom9_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [atom9_param_1];
+; CHECK-NEXT: atom.or.b64 %rd3, [%rd1], %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%ret = atomicrmw or ptr %subr, i64 %val seq_cst
ret i64 %ret
}
; CHECK-LABEL: atom10
define i32 @atom10(ptr %subr, i32 %val) {
-; CHECK: atom.xor.b32
+; CHECK-LABEL: atom10(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom10_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [atom10_param_1];
+; CHECK-NEXT: atom.xor.b32 %r2, [%rd1], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%ret = atomicrmw xor ptr %subr, i32 %val seq_cst
ret i32 %ret
}
; CHECK-LABEL: atom11
define i64 @atom11(ptr %subr, i64 %val) {
-; CHECK: atom.xor.b64
+; CHECK-LABEL: atom11(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom11_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [atom11_param_1];
+; CHECK-NEXT: atom.xor.b64 %rd3, [%rd1], %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%ret = atomicrmw xor ptr %subr, i64 %val seq_cst
ret i64 %ret
}
; CHECK-LABEL: atom12
define i32 @atom12(ptr %subr, i32 %val) {
-; CHECK: atom.max.s32
+; CHECK-LABEL: atom12(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom12_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [atom12_param_1];
+; CHECK-NEXT: atom.max.s32 %r2, [%rd1], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%ret = atomicrmw max ptr %subr, i32 %val seq_cst
ret i32 %ret
}
; CHECK-LABEL: atom13
define i64 @atom13(ptr %subr, i64 %val) {
-; CHECK: atom.max.s64
+; CHECK-LABEL: atom13(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom13_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [atom13_param_1];
+; CHECK-NEXT: atom.max.s64 %rd3, [%rd1], %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%ret = atomicrmw max ptr %subr, i64 %val seq_cst
ret i64 %ret
}
; CHECK-LABEL: atom14
define i32 @atom14(ptr %subr, i32 %val) {
-; CHECK: atom.min.s32
+; CHECK-LABEL: atom14(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom14_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [atom14_param_1];
+; CHECK-NEXT: atom.min.s32 %r2, [%rd1], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%ret = atomicrmw min ptr %subr, i32 %val seq_cst
ret i32 %ret
}
; CHECK-LABEL: atom15
define i64 @atom15(ptr %subr, i64 %val) {
-; CHECK: atom.min.s64
+; CHECK-LABEL: atom15(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom15_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [atom15_param_1];
+; CHECK-NEXT: atom.min.s64 %rd3, [%rd1], %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%ret = atomicrmw min ptr %subr, i64 %val seq_cst
ret i64 %ret
}
; CHECK-LABEL: atom16
define i32 @atom16(ptr %subr, i32 %val) {
-; CHECK: atom.max.u32
+; CHECK-LABEL: atom16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom16_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [atom16_param_1];
+; CHECK-NEXT: atom.max.u32 %r2, [%rd1], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%ret = atomicrmw umax ptr %subr, i32 %val seq_cst
ret i32 %ret
}
; CHECK-LABEL: atom17
define i64 @atom17(ptr %subr, i64 %val) {
-; CHECK: atom.max.u64
+; CHECK-LABEL: atom17(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom17_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [atom17_param_1];
+; CHECK-NEXT: atom.max.u64 %rd3, [%rd1], %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%ret = atomicrmw umax ptr %subr, i64 %val seq_cst
ret i64 %ret
}
; CHECK-LABEL: atom18
define i32 @atom18(ptr %subr, i32 %val) {
-; CHECK: atom.min.u32
+; CHECK-LABEL: atom18(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom18_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [atom18_param_1];
+; CHECK-NEXT: atom.min.u32 %r2, [%rd1], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%ret = atomicrmw umin ptr %subr, i32 %val seq_cst
ret i32 %ret
}
; CHECK-LABEL: atom19
define i64 @atom19(ptr %subr, i64 %val) {
-; CHECK: atom.min.u64
+; CHECK-LABEL: atom19(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom19_param_0];
+; CHECK-NEXT: ld.param.u64 %rd2, [atom19_param_1];
+; CHECK-NEXT: atom.min.u64 %rd3, [%rd1], %rd2;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%ret = atomicrmw umin ptr %subr, i64 %val seq_cst
ret i64 %ret
}
@@ -145,7 +323,17 @@ declare float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val)
; CHECK-LABEL: atomic_add_f32_generic
define float @atomic_add_f32_generic(ptr %addr, float %val) {
-; CHECK: atom.add.f32
+; CHECK-LABEL: atomic_add_f32_generic(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_generic_param_0];
+; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_generic_param_1];
+; CHECK-NEXT: atom.add.f32 %f2, [%rd1], %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%ret = call float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val)
ret float %ret
}
@@ -154,7 +342,17 @@ declare float @llvm.nvvm.atomic.load.add.f32.p1(ptr addrspace(1) %addr, float %v
; CHECK-LABEL: atomic_add_f32_addrspace1
define float @atomic_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
-; CHECK: atom.global.add.f32
+; CHECK-LABEL: atomic_add_f32_addrspace1(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_addrspace1_param_0];
+; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_addrspace1_param_1];
+; CHECK-NEXT: atom.global.add.f32 %f2, [%rd1], %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%ret = call float @llvm.nvvm.atomic.load.add.f32.p1(ptr addrspace(1) %addr, float %val)
ret float %ret
}
@@ -163,52 +361,152 @@ declare float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %v
; CHECK-LABEL: atomic_add_f32_addrspace3
define float @atomic_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
-; CHECK: atom.shared.add.f32
+; CHECK-LABEL: atomic_add_f32_addrspace3(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_addrspace3_param_0];
+; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_addrspace3_param_1];
+; CHECK-NEXT: atom.shared.add.f32 %f2, [%rd1], %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%ret = call float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %val)
ret float %ret
}
; CHECK-LABEL: atomicrmw_add_f32_generic
define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
-; CHECK: atom.add.f32
+; CHECK-LABEL: atomicrmw_add_f32_generic(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_generic_param_0];
+; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_generic_param_1];
+; CHECK-NEXT: atom.add.f32 %f2, [%rd1], %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%ret = atomicrmw fadd ptr %addr, float %val seq_cst
ret float %ret
}
; CHECK-LABEL: atomicrmw_add_f16_generic
define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
-; CHECK: membar.sys
-; CHECK: atom.cas
+; CHECK-LABEL: atomicrmw_add_f16_generic(
+; CHECK: {
+; CHECK-NEXT: .reg .pred %p<2>;
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<17>;
+; CHECK-NEXT: .reg .f32 %f<4>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b16 %rs1, [atomicrmw_add_f16_generic_param_1];
+; CHECK-NEXT: ld.param.u64 %rd2, [atomicrmw_add_f16_generic_param_0];
+; CHECK-NEXT: and.b64 %rd1, %rd2, -4;
+; CHECK-NEXT: cvt.u32.u64 %r6, %rd2;
+; CHECK-NEXT: and.b32 %r7, %r6, 3;
+; CHECK-NEXT: shl.b32 %r1, %r7, 3;
+; CHECK-NEXT: mov.b32 %r8, 65535;
+; CHECK-NEXT: shl.b32 %r9, %r8, %r1;
+; CHECK-NEXT: not.b32 %r2, %r9;
+; CHECK-NEXT: ld.u32 %r16, [%rd1];
+; CHECK-NEXT: cvt.f32.f16 %f2, %rs1;
+; CHECK-NEXT: $L__BB22_1: // %atomicrmw.start
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: shr.u32 %r10, %r16, %r1;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r10;
+; CHECK-NEXT: cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2;
+; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs3;
+; CHECK-NEXT: shl.b32 %r12, %r11, %r1;
+; CHECK-NEXT: and.b32 %r13, %r16, %r2;
+; CHECK-NEXT: or.b32 %r14, %r13, %r12;
+; CHECK-NEXT: membar.sys;
+; CHECK-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r14;
+; CHECK-NEXT: setp.ne.s32 %p1, %r5, %r16;
+; CHECK-NEXT: mov.u32 %r16, %r5;
+; CHECK-NEXT: @%p1 bra $L__BB22_1;
+; CHECK-NEXT: // %bb.2: // %atomicrmw.end
+; CHECK-NEXT: shr.u32 %r15, %r5, %r1;
+; CHECK-NEXT: cvt.u16.u32 %rs4, %r15;
+; CHECK-NEXT: st.param.b16 [func_retval0], %rs4;
+; CHECK-NEXT: ret;
%ret = atomicrmw fadd ptr %addr, half %val seq_cst
ret half %ret
}
; CHECK-LABEL: atomicrmw_add_f32_addrspace1
define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
-; CHECK: atom.global.add.f32
+; CHECK-LABEL: atomicrmw_add_f32_addrspace1(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace1_param_0];
+; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_addrspace1_param_1];
+; CHECK-NEXT: atom.global.add.f32 %f2, [%rd1], %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%ret = atomicrmw fadd ptr addrspace(1) %addr, float %val seq_cst
ret float %ret
}
; CHECK-LABEL: atomicrmw_add_f32_addrspace3
define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
-; CHECK: atom.shared.add.f32
+; CHECK-LABEL: atomicrmw_add_f32_addrspace3(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace3_param_0];
+; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_addrspace3_param_1];
+; CHECK-NEXT: atom.shared.add.f32 %f2, [%rd1], %f1;
+; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
+; CHECK-NEXT: ret;
%ret = atomicrmw fadd ptr addrspace(3) %addr, float %val seq_cst
ret float %ret
}
; CHECK-LABEL: atomic_cmpxchg_i32
define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) {
-; CHECK: membar.sys
-; CHECK: atom.cas.b32
+; CHECK-LABEL: atomic_cmpxchg_i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atomic_cmpxchg_i32_param_0];
+; CHECK-NEXT: membar.sys;
+; CHECK-NEXT: ld.param.u32 %r1, [atomic_cmpxchg_i32_param_1];
+; CHECK-NEXT: ld.param.u32 %r2, [atomic_cmpxchg_i32_param_2];
+; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
%pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
ret i32 %new
}
; CHECK-LABEL: atomic_cmpxchg_i64
define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) {
-; CHECK: membar.sys
-; CHECK: atom.cas.b64
+; CHECK-LABEL: atomic_cmpxchg_i64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atomic_cmpxchg_i64_param_0];
+; CHECK-NEXT: membar.sys;
+; CHECK-NEXT: ld.param.u64 %rd2, [atomic_cmpxchg_i64_param_1];
+; CHECK-NEXT: ld.param.u64 %rd3, [atomic_cmpxchg_i64_param_2];
+; CHECK-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
+; CHECK-NEXT: ret;
%pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
ret i64 %new
}
More information about the llvm-commits
mailing list