[llvm] Reland "[NVPTX][AtomicExpandPass] Complete support for AtomicRMW in NVPTX" (PR #179376)
Akshay Deodhar via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 2 18:51:39 PST 2026
https://github.com/akshayrdeodhar created https://github.com/llvm/llvm-project/pull/179376
This PR adds full support for atomicrmw in NVPTX. This includes:
- Memory order and syncscope support (changes in AtomicExpandPass.cpp, NVPTXIntrinsics.td)
- Script-generated tests for integer and atomic operations (atomicrmw.py, atomicrmw-sm*.ll in tests/CodeGen/NVPTX). Existing atomics tests which are subsumed by these have been removed (atomics-sm*.ll, atomics.ll, atomicrmw-expand.ll).
- Changes shouldExpandAtomicRMWInIR to take a constant argument: This is to allow some other TargetLowering constant-argument functions to call it. This change touches several backends. An alternative solution exists, but to me, this seems the "right" way. Has been split out into [NFC][TargetLowering] Make shouldExpandAtomicRMWInIR and shouldExpandAtomicCmpXchgInIR take a const Instruction pointer #176073. Rebased.
- NOTE: The initial load issued for atomicrmw emulation loops (and cmpxchg emulation loops) must be a strong load. Currently, AtomicExpandPass issues a weak load. Fixing this breaks several backends. I'm planning to follow up with a separate PR.
Initially failed due to error: `ptxas fatal : Value 'sm_60' is not defined for option 'gpu-name'`. Updated RUN lines in atomicrmw-sm*.ll to skip the ptxas-verify check if ptxas does not support that SM version.
>From f89e74a6a61649addc794aa7554a332e2f0983e0 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Mon, 5 Jan 2026 19:59:20 +0000
Subject: [PATCH 1/2] [NVPTX][AtomicExpandPass] Complete support for AtomicRMW
in NVPTX
---
llvm/lib/CodeGen/AtomicExpandPass.cpp | 20 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 96 +-
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 115 +-
llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll | 151 -
llvm/test/CodeGen/NVPTX/atomicrmw-sm60.ll | 3137 +++++++++++++++++
llvm/test/CodeGen/NVPTX/atomicrmw-sm70.ll | 3111 ++++++++++++++++
llvm/test/CodeGen/NVPTX/atomicrmw-sm90.ll | 2983 ++++++++++++++++
llvm/test/CodeGen/NVPTX/atomicrmw.py | 120 +
llvm/test/CodeGen/NVPTX/atomics-sm60.ll | 17 +-
llvm/test/CodeGen/NVPTX/atomics-sm70.ll | 144 -
llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 147 -
llvm/test/CodeGen/NVPTX/atomics.ll | 479 +--
.../NVPTX/distributed-shared-cluster.ll | 52 +-
llvm/test/CodeGen/NVPTX/lit.local.cfg | 2 +-
14 files changed, 9537 insertions(+), 1037 deletions(-)
delete mode 100644 llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll
create mode 100644 llvm/test/CodeGen/NVPTX/atomicrmw-sm60.ll
create mode 100644 llvm/test/CodeGen/NVPTX/atomicrmw-sm70.ll
create mode 100644 llvm/test/CodeGen/NVPTX/atomicrmw-sm90.ll
create mode 100644 llvm/test/CodeGen/NVPTX/atomicrmw.py
delete mode 100644 llvm/test/CodeGen/NVPTX/atomics-sm70.ll
delete mode 100644 llvm/test/CodeGen/NVPTX/atomics-sm90.ll
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 3d2a52dab153e..e7195fa472754 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -107,7 +107,7 @@ class AtomicExpandImpl {
AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
static Value *insertRMWCmpXchgLoop(
IRBuilderBase &Builder, Type *ResultType, Value *Addr, Align AddrAlign,
- AtomicOrdering MemOpOrder, SyncScope::ID SSID,
+ AtomicOrdering MemOpOrder, SyncScope::ID SSID, bool IsVolatile,
function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc);
bool tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI);
@@ -331,7 +331,7 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
} else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
isAcquireOrStronger(RMWI->getOrdering()))) {
FenceOrdering = RMWI->getOrdering();
- RMWI->setOrdering(AtomicOrdering::Monotonic);
+ RMWI->setOrdering(TLI->atomicOperationOrderAfterFenceSplit(RMWI));
} else if (CASI &&
TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
TargetLoweringBase::AtomicExpansionKind::None &&
@@ -1002,6 +1002,7 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
}
AtomicOrdering MemOpOrder = AI->getOrdering();
SyncScope::ID SSID = AI->getSyncScopeID();
+ bool IsVolatile = AI->isVolatile();
ReplacementIRBuilder Builder(AI, *DL);
@@ -1025,9 +1026,10 @@ void AtomicExpandImpl::expandPartwordAtomicRMW(
Value *OldResult;
if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
- OldResult = insertRMWCmpXchgLoop(
- Builder, PMV.WordType, PMV.AlignedAddr, PMV.AlignedAddrAlignment,
- MemOpOrder, SSID, PerformPartwordOp, createCmpXchgInstFun, AI);
+ OldResult = insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr,
+ PMV.AlignedAddrAlignment, MemOpOrder, SSID,
+ IsVolatile, PerformPartwordOp,
+ createCmpXchgInstFun, AI);
} else {
assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr,
@@ -1649,7 +1651,7 @@ bool AtomicExpandImpl::simplifyIdempotentRMW(AtomicRMWInst *RMWI) {
Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign,
- AtomicOrdering MemOpOrder, SyncScope::ID SSID,
+ AtomicOrdering MemOpOrder, SyncScope::ID SSID, bool IsVolatile,
function_ref<Value *(IRBuilderBase &, Value *)> PerformOp,
CreateCmpXchgInstFun CreateCmpXchg, Instruction *MetadataSrc) {
LLVMContext &Ctx = Builder.getContext();
@@ -1681,6 +1683,10 @@ Value *AtomicExpandImpl::insertRMWCmpXchgLoop(
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
LoadInst *InitLoaded = Builder.CreateAlignedLoad(ResultTy, Addr, AddrAlign);
+ // TODO: The initial load must be strong to avoid a data race with concurrent
+ // stores. Issue a strong load with the same synchronization scope as the
+ // atomicrmw instruction here.
+ InitLoaded->setVolatile(IsVolatile);
Builder.CreateBr(LoopBB);
// Start the main loop block now that we've taken care of the preliminaries.
@@ -1750,7 +1756,7 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
// loop for the FP atomics.
Value *Loaded = AtomicExpandImpl::insertRMWCmpXchgLoop(
Builder, AI->getType(), AI->getPointerOperand(), AI->getAlign(),
- AI->getOrdering(), AI->getSyncScopeID(),
+ AI->getOrdering(), AI->getSyncScopeID(), AI->isVolatile(),
[&](IRBuilderBase &Builder, Value *Loaded) {
return buildAtomicRMWValue(AI->getOperation(), Builder, Loaded,
AI->getValOperand());
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 02b2b217aff51..be17f1cdbf277 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -7436,40 +7436,70 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const {
bool NVPTXTargetLowering::shouldInsertFencesForAtomic(
const Instruction *I) const {
auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
+ auto *RI = dyn_cast<AtomicRMWInst>(I);
// When CAS bitwidth is not supported on the hardware, the CAS is emulated
- // using a retry loop that uses a higher-bitwidth monotonic CAS. We enforce
- // the memory order using explicit fences around the retry loop.
- // The memory order of natively supported CAS operations can be enforced
- // by lowering to an atom.cas with the right memory synchronizing effect.
- // However, atom.cas only supports relaxed, acquire, release and acq_rel.
- // So we also use explicit fences for enforcing memory order for
- // seq_cast CAS with natively-supported bitwidths.
- return CI &&
- (cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
- STI.getMinCmpXchgSizeInBits() ||
- CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent);
+ // using a retry loop that uses a higher-bitwidth monotonic CAS. Similarly, if
+ // the atomicrmw operation is not supported on hardware, we emulate it with a
+ // cmpxchg loop. In such cases, we enforce the memory order using explicit
+ // fences around the retry loop.
+ // The memory order of natively supported CAS or RMW operations can be
+ // enforced by lowering to an `atom.<op>` instr with the right memory
+ // synchronization effect. However, atom only supports relaxed, acquire,
+ // release and acq_rel. So we also use explicit fences to enforce memory
+ // order in seq_cst CAS or RMW instructions that can be lowered as acq_rel.
+ if (CI)
+ return (cast<IntegerType>(CI->getCompareOperand()->getType())
+ ->getBitWidth() < STI.getMinCmpXchgSizeInBits()) ||
+ CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent;
+ if (RI)
+ return shouldExpandAtomicRMWInIR(RI) == AtomicExpansionKind::CmpXChg ||
+ RI->getOrdering() == AtomicOrdering::SequentiallyConsistent;
+ return false;
}
AtomicOrdering NVPTXTargetLowering::atomicOperationOrderAfterFenceSplit(
const Instruction *I) const {
- auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
- bool BitwidthSupportedAndIsSeqCst =
- CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
+ // Only lower to atom.<op>.acquire if the operation is not emulated, and its
+ // ordering is seq_cst. This produces a sequence of the form:
+ // fence.sc
+ // atom.<op>.acquire
+ // Instead of
+ // fence.sc
+ // atom.<op>
+ // fence.acquire
+ // The two-instruction sequence is weaker than the alternative, but guarantees
+ // seq_cst ordering.
+ //
+ // In all other cases, lower to atom.<op>.relaxed
+ const auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
+ const auto *RI = dyn_cast<AtomicRMWInst>(I);
+ AtomicOrdering Ordering;
+ if (CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
- STI.getMinCmpXchgSizeInBits();
- return BitwidthSupportedAndIsSeqCst ? AtomicOrdering::Acquire
- : AtomicOrdering::Monotonic;
+ STI.getMinCmpXchgSizeInBits())
+ Ordering = AtomicOrdering::Acquire;
+ else if (RI && RI->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
+ shouldExpandAtomicRMWInIR(RI) == AtomicExpansionKind::None)
+ Ordering = AtomicOrdering::Acquire;
+ else
+ Ordering = AtomicOrdering::Monotonic;
+ return Ordering;
}
+// prerequisites: shouldInsertFencesForAtomic() returns true for Inst
Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
- if (!isa<AtomicCmpXchgInst>(Inst))
+ auto IsCmpXchg = isa<AtomicCmpXchgInst>(Inst);
+ auto IsRMW = isa<AtomicRMWInst>(Inst);
+ if (!IsCmpXchg && !IsRMW)
return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
- // Specialize for cmpxchg
+ // Specialize for cmpxchg and rmw
// Emit a fence.sc leading fence for cmpxchg seq_cst which are not emulated
- SyncScope::ID SSID = cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID();
+ SyncScope::ID SSID = IsCmpXchg
+ ? cast<AtomicCmpXchgInst>(Inst)->getSyncScopeID()
+ : cast<AtomicRMWInst>(Inst)->getSyncScopeID();
if (isReleaseOrStronger(Ord))
return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
? Ord
@@ -7479,21 +7509,27 @@ Instruction *NVPTXTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
return nullptr;
}
+// prerequisites: shouldInsertFencesForAtomic() returns true for Inst
Instruction *NVPTXTargetLowering::emitTrailingFence(IRBuilderBase &Builder,
Instruction *Inst,
AtomicOrdering Ord) const {
- // Specialize for cmpxchg
- if (!isa<AtomicCmpXchgInst>(Inst))
+ // Specialize for cmpxchg and rmw
+ auto *CI = dyn_cast<AtomicCmpXchgInst>(Inst);
+ auto *RI = dyn_cast<AtomicRMWInst>(Inst);
+ if (!CI && !RI)
return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
- auto *CI = cast<AtomicCmpXchgInst>(Inst);
- auto CASWidth =
- cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth();
- SyncScope::ID SSID = CI->getSyncScopeID();
- // Do not emit a trailing fence for cmpxchg seq_cst which are not emulated
- if (isAcquireOrStronger(Ord) &&
- (Ord != AtomicOrdering::SequentiallyConsistent ||
- CASWidth < STI.getMinCmpXchgSizeInBits()))
+ SyncScope::ID SSID = CI ? CI->getSyncScopeID() : RI->getSyncScopeID();
+
+ bool IsEmulated = false;
+ if (CI)
+ IsEmulated =
+ cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() <
+ STI.getMinCmpXchgSizeInBits();
+ else if (RI)
+ IsEmulated = shouldExpandAtomicRMWInIR(RI) == AtomicExpansionKind::CmpXChg;
+
+ if (isAcquireOrStronger(Ord) && IsEmulated)
return Builder.CreateFence(AtomicOrdering::Acquire, SSID);
return nullptr;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index ad5dd356ee90f..5e13f7ff1efe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2372,16 +2372,7 @@ def INT_FNS_iii : INT_FNS_MBO<(ins i32imm:$mask, i32imm:$base, i32imm:$
// Atomic Functions
//-----------------------------------
-class ATOMIC_GLOBAL_CHK <dag frag>
- : PatFrag<!setdagop(frag, ops), frag, AS_match.global>;
-class ATOMIC_SHARED_CHK <dag frag>
- : PatFrag<!setdagop(frag, ops), frag, AS_match.shared>;
-class ATOMIC_SHARED_CLUSTER_CHK <dag frag>
- : PatFrag<!setdagop(frag, ops), frag, AS_match.shared_cluster>;
-class ATOMIC_GENERIC_CHK <dag frag>
- : PatFrag<!setdagop(frag, ops), frag, AS_match.generic>;
-
-multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
+multiclass F_ATOMIC_2_INTRINSIC<RegTyInfo t, string sem_str, string as_str, string op_str,
SDPatternOperator op, list<Predicate> preds> {
defvar asm_str = "atom" # sem_str # as_str # "." # op_str;
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
@@ -2397,6 +2388,42 @@ multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
}
}
+multiclass F_ATOMIC_2<RegTyInfo t, SDPatternOperator op, string op_str, SDNode atomic, list<Predicate> preds = []> {
+ defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str;
+ let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
+ def _r : BasicFlagsNVPTXInst<(outs t.RC:$dst),
+ (ins ADDR:$addr, t.RC:$b),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ asm_str>,
+ Requires<preds>;
+ if t.SupportsImm then
+ def _i : BasicFlagsNVPTXInst<(outs t.RC:$dst),
+ (ins ADDR:$addr, t.Imm:$b),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp),
+ asm_str>,
+ Requires<preds>;
+ }
+
+ defvar GetSem = SDNodeXForm<atomic, [{
+ return getI32Imm(getMemOrder(cast<MemSDNode>(N)), SDLoc(N));
+ }]>;
+
+ defvar GetScope = SDNodeXForm<atomic, [{
+ return getI32Imm(getAtomicScope(cast<MemSDNode>(N)), SDLoc(N));
+ }]>;
+
+ defvar GetAddSp = SDNodeXForm<atomic, [{
+ return getI32Imm(getAddrSpace(cast<MemSDNode>(N)), SDLoc(N));
+ }]>;
+
+ def : Pat<(op:$this addr:$addr, t.Ty:$b),
+ (!cast<Instruction>(NAME # _r) ADDR:$addr, t.Ty:$b, (GetSem $this), (GetScope $this), (GetAddSp $this))>;
+
+ if t.SupportsImm then
+ def : Pat<(op:$this addr:$addr, (t.Ty t.ImmNode:$b)),
+ (!cast<Instruction>(NAME # _i) ADDR:$addr, (t.Ty t.ImmNode:$b), (GetSem $this), (GetScope $this), (GetAddSp $this))>;
+}
+
multiclass F_ATOMIC_3<RegTyInfo t, string op_str, SDPatternOperator op, SDNode atomic> {
defvar asm_str = "atom${sem:sem}${scope:scope}${addsp:addsp}" # op_str;
@@ -2447,55 +2474,57 @@ multiclass F_ATOMIC_3<RegTyInfo t, string op_str, SDPatternOperator op, SDNode a
(!cast<Instruction>(NAME # _ii) ADDR:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c), (GetSem $this), (GetScope $this), (GetAddSp $this))>;
}
-multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, list<Predicate> preds = []> {
- defvar frag_pat = (frag node:$a, node:$b);
- defm _G : F_ATOMIC_2<t, "", ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
- defm _S : F_ATOMIC_2<t, "", ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
- defm _S_C : F_ATOMIC_2<t, "", ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
- defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
-}
+defm atomic_load_fadd : binary_atomic_op_fp<atomic_load_fadd>;
// atom_add
-defm INT_PTX_ATOM_ADD_32 : F_ATOMIC_2_AS<I32RT, atomic_load_add_i32, "add.u32">;
-defm INT_PTX_ATOM_ADD_64 : F_ATOMIC_2_AS<I64RT, atomic_load_add_i64, "add.u64">;
+defm INT_PTX_ATOM_ADD_32 : F_ATOMIC_2<I32RT, atomic_load_add_i32, ".add.u32", atomic_load_add>;
+defm INT_PTX_ATOM_ADD_64 : F_ATOMIC_2<I64RT, atomic_load_add_i64, ".add.u64", atomic_load_add>;
-defm INT_PTX_ATOM_ADD_F16 : F_ATOMIC_2_AS<F16RT, atomic_load_fadd, "add.noftz.f16", [hasSM<70>, hasPTX<63>]>;
-defm INT_PTX_ATOM_ADD_BF16 : F_ATOMIC_2_AS<BF16RT, atomic_load_fadd, "add.noftz.bf16", [hasSM<90>, hasPTX<78>]>;
-defm INT_PTX_ATOM_ADD_F32 : F_ATOMIC_2_AS<F32RT, atomic_load_fadd, "add.f32">;
-defm INT_PTX_ATOM_ADD_F64 : F_ATOMIC_2_AS<F64RT, atomic_load_fadd, "add.f64", [hasAtomAddF64]>;
+defm INT_PTX_ATOM_ADD_F16 : F_ATOMIC_2<F16RT, atomic_load_fadd, ".add.noftz.f16", atomic_load_fadd, [hasSM<70>, hasPTX<63>]>;
+defm INT_PTX_ATOM_ADD_BF16 : F_ATOMIC_2<BF16RT, atomic_load_fadd, ".add.noftz.bf16", atomic_load_fadd, [hasSM<90>, hasPTX<78>]>;
+defm INT_PTX_ATOM_ADD_F32 : F_ATOMIC_2<F32RT, atomic_load_fadd, ".add.f32", atomic_load_fadd>;
+defm INT_PTX_ATOM_ADD_F64 : F_ATOMIC_2<F64RT, atomic_load_fadd, ".add.f64", atomic_load_fadd, [hasAtomAddF64]>;
// atom_swap
-defm INT_PTX_ATOM_SWAP_32 : F_ATOMIC_2_AS<I32RT, atomic_swap_i32, "exch.b32">;
-defm INT_PTX_ATOM_SWAP_64 : F_ATOMIC_2_AS<I64RT, atomic_swap_i64, "exch.b64">;
+defm INT_PTX_ATOM_SWAP_32 : F_ATOMIC_2<I32RT, atomic_swap_i32, ".exch.b32", atomic_swap>;
+defm INT_PTX_ATOM_SWAP_64 : F_ATOMIC_2<I64RT, atomic_swap_i64, ".exch.b64", atomic_swap>;
// atom_max
-defm INT_PTX_ATOMIC_MAX_32 : F_ATOMIC_2_AS<I32RT, atomic_load_max_i32, "max.s32">;
-defm INT_PTX_ATOMIC_MAX_64 : F_ATOMIC_2_AS<I64RT, atomic_load_max_i64, "max.s64", [hasSM<32>]>;
-defm INT_PTX_ATOMIC_UMAX_32 : F_ATOMIC_2_AS<I32RT, atomic_load_umax_i32, "max.u32">;
-defm INT_PTX_ATOMIC_UMAX_64 : F_ATOMIC_2_AS<I64RT, atomic_load_umax_i64, "max.u64", [hasSM<32>]>;
+defm INT_PTX_ATOMIC_MAX_32 : F_ATOMIC_2<I32RT, atomic_load_max_i32, ".max.s32", atomic_load_max>;
+defm INT_PTX_ATOMIC_MAX_64 : F_ATOMIC_2<I64RT, atomic_load_max_i64, ".max.s64", atomic_load_max, [hasSM<32>]>;
+defm INT_PTX_ATOMIC_UMAX_32 : F_ATOMIC_2<I32RT, atomic_load_umax_i32, ".max.u32", atomic_load_umax>;
+defm INT_PTX_ATOMIC_UMAX_64 : F_ATOMIC_2<I64RT, atomic_load_umax_i64, ".max.u64", atomic_load_umax, [hasSM<32>]>;
// atom_min
-defm INT_PTX_ATOMIC_MIN_32 : F_ATOMIC_2_AS<I32RT, atomic_load_min_i32, "min.s32">;
-defm INT_PTX_ATOMIC_MIN_64 : F_ATOMIC_2_AS<I64RT, atomic_load_min_i64, "min.s64", [hasSM<32>]>;
-defm INT_PTX_ATOMIC_UMIN_32 : F_ATOMIC_2_AS<I32RT, atomic_load_umin_i32, "min.u32">;
-defm INT_PTX_ATOMIC_UMIN_64 : F_ATOMIC_2_AS<I64RT, atomic_load_umin_i64, "min.u64", [hasSM<32>]>;
+defm INT_PTX_ATOMIC_MIN_32 : F_ATOMIC_2<I32RT, atomic_load_min_i32, ".min.s32", atomic_load_min>;
+defm INT_PTX_ATOMIC_MIN_64 : F_ATOMIC_2<I64RT, atomic_load_min_i64, ".min.s64", atomic_load_min, [hasSM<32>]>;
+defm INT_PTX_ATOMIC_UMIN_32 : F_ATOMIC_2<I32RT, atomic_load_umin_i32, ".min.u32", atomic_load_umin>;
+defm INT_PTX_ATOMIC_UMIN_64 : F_ATOMIC_2<I64RT, atomic_load_umin_i64, ".min.u64", atomic_load_umin, [hasSM<32>]>;
+
+// NOTE: The semantics for atomicrmw fmin (aka llvm.minnum)[1] are *slightly*
+// different from the ones for atom.min.f32[2]. However, LLVM
+// LangRef specifies that sNaNs can be treated as qNaNs. This makes the two
+// operations equivalent, as the only difference in behavior is that min(sNaN,
+// NUM) -> qNaN in LLVM, and atom.min returns min(NaN, NUM) -> NUM for all NaNs.
+// [1] https://llvm.org/docs/LangRef.html#llvm-implementation
+// [2] https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#half-precision-floating-point-instructions-min
+// [3] https://llvm.org/docs/LangRef.html#behavior-of-floating-point-nan-values
// atom_inc atom_dec
-defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS<I32RT, atomic_load_uinc_wrap_i32, "inc.u32">;
-defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS<I32RT, atomic_load_udec_wrap_i32, "dec.u32">;
+defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2<I32RT, atomic_load_uinc_wrap_i32, ".inc.u32", atomic_load_uinc_wrap>;
+defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2<I32RT, atomic_load_udec_wrap_i32, ".dec.u32", atomic_load_udec_wrap>;
// atom_and
-defm INT_PTX_ATOM_AND_32 : F_ATOMIC_2_AS<I32RT, atomic_load_and_i32, "and.b32">;
-defm INT_PTX_ATOM_AND_64 : F_ATOMIC_2_AS<I64RT, atomic_load_and_i64, "and.b64", [hasSM<32>]>;
+defm INT_PTX_ATOM_AND_32 : F_ATOMIC_2<I32RT, atomic_load_and_i32, ".and.b32", atomic_load_and>;
+defm INT_PTX_ATOM_AND_64 : F_ATOMIC_2<I64RT, atomic_load_and_i64, ".and.b64", atomic_load_and, [hasSM<32>]>;
// atom_or
-defm INT_PTX_ATOM_OR_32 : F_ATOMIC_2_AS<I32RT, atomic_load_or_i32, "or.b32">;
-defm INT_PTX_ATOM_OR_64 : F_ATOMIC_2_AS<I64RT, atomic_load_or_i64, "or.b64", [hasSM<32>]>;
+defm INT_PTX_ATOM_OR_32 : F_ATOMIC_2<I32RT, atomic_load_or_i32, ".or.b32", atomic_load_or>;
+defm INT_PTX_ATOM_OR_64 : F_ATOMIC_2<I64RT, atomic_load_or_i64, ".or.b64", atomic_load_or, [hasSM<32>]>;
// atom_xor
-defm INT_PTX_ATOM_XOR_32 : F_ATOMIC_2_AS<I32RT, atomic_load_xor_i32, "xor.b32">;
-defm INT_PTX_ATOM_XOR_64 : F_ATOMIC_2_AS<I64RT, atomic_load_xor_i64, "xor.b64", [hasSM<32>]>;
-
+defm INT_PTX_ATOM_XOR_32 : F_ATOMIC_2<I32RT, atomic_load_xor_i32, ".xor.b32", atomic_load_xor>;
+defm INT_PTX_ATOM_XOR_64 : F_ATOMIC_2<I64RT, atomic_load_xor_i64, ".xor.b64", atomic_load_xor, [hasSM<32>]>;
// Define atom.cas for all combinations of size x addrspace x memory order
// supported in PTX *and* on the hardware.
@@ -2518,7 +2547,7 @@ foreach t = [I16RT, I32RT, I64RT] in {
multiclass ATOM2N_impl<string OpStr, string IntTypeStr, string TypeStr,
string ScopeStr, string SpaceStr,
RegTyInfo t, list<Predicate> Preds> {
- defm "" : F_ATOMIC_2<t,
+ defm "" : F_ATOMIC_2_INTRINSIC<t,
as_str = !if(!eq(ScopeStr, "gpu"), "", "." # ScopeStr),
sem_str = !if(!eq(SpaceStr, "gen"), "", "." # SpaceStr),
op_str = OpStr # "." # TypeStr,
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll
deleted file mode 100644
index 88fae7a3f78a0..0000000000000
--- a/llvm/test/CodeGen/NVPTX/atomicrmw-expand.ll
+++ /dev/null
@@ -1,151 +0,0 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=ALL,SM30
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 | FileCheck %s --check-prefixes=ALL,SM60
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}
-; RUN: %if ptxas-sm_60 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_60 | %ptxas-verify -arch=sm_60 %}
-
-; CHECK-LABEL: fadd_double
-define void @fadd_double(ptr %0, double %1) {
-entry:
- ; SM30: atom.cas.b64
- ; SM60: atom.add.f64
- %2 = atomicrmw fadd ptr %0, double %1 monotonic, align 8
- ret void
-}
-
-; CHECK-LABEL: fadd_float
-define void @fadd_float(ptr %0, float %1) {
-entry:
- ; ALL: atom.add.f32
- %2 = atomicrmw fadd ptr %0, float %1 monotonic, align 4
- ret void
-}
-
-; CHECK-LABEL: bitwise_i32
-define void @bitwise_i32(ptr %0, i32 %1) {
-entry:
- ; ALL: atom.and.b32
- %2 = atomicrmw and ptr %0, i32 %1 monotonic, align 4
- ; ALL: atom.or.b32
- %3 = atomicrmw or ptr %0, i32 %1 monotonic, align 4
- ; ALL: atom.xor.b32
- %4 = atomicrmw xor ptr %0, i32 %1 monotonic, align 4
- ; ALL: atom.exch.b32
- %5 = atomicrmw xchg ptr %0, i32 %1 monotonic, align 4
- ret void
-}
-
-; CHECK-LABEL: bitwise_i64
-define void @bitwise_i64(ptr %0, i64 %1) {
-entry:
- ; SM30: atom.cas.b64
- ; SM60: atom.and.b64
- %2 = atomicrmw and ptr %0, i64 %1 monotonic, align 8
- ; SM30: atom.cas.b64
- ; SM60: atom.or.b64
- %3 = atomicrmw or ptr %0, i64 %1 monotonic, align 8
- ; SM30: atom.cas.b64
- ; SM60: atom.xor.b64
- %4 = atomicrmw xor ptr %0, i64 %1 monotonic, align 8
- ; SM30: atom.cas.b64
- ; SM60: atom.exch.b64
- %5 = atomicrmw xchg ptr %0, i64 %1 monotonic, align 8
- ret void
-}
-
-; CHECK-LABEL: minmax_i32
-define void @minmax_i32(ptr %0, i32 %1) {
-entry:
- ; ALL: atom.min.s32
- %2 = atomicrmw min ptr %0, i32 %1 monotonic, align 4
- ; ALL: atom.max.s32
- %3 = atomicrmw max ptr %0, i32 %1 monotonic, align 4
- ; ALL: atom.min.u32
- %4 = atomicrmw umin ptr %0, i32 %1 monotonic, align 4
- ; ALL: atom.max.u32
- %5 = atomicrmw umax ptr %0, i32 %1 monotonic, align 4
- ret void
-}
-
-; CHECK-LABEL: minmax_i64
-define void @minmax_i64(ptr %0, i64 %1) {
-entry:
- ; SM30: atom.cas.b64
- ; SM60: atom.min.s64
- %2 = atomicrmw min ptr %0, i64 %1 monotonic, align 8
- ; SM30: atom.cas.b64
- ; SM60: atom.max.s64
- %3 = atomicrmw max ptr %0, i64 %1 monotonic, align 8
- ; SM30: atom.cas.b64
- ; SM60: atom.min.u64
- %4 = atomicrmw umin ptr %0, i64 %1 monotonic, align 8
- ; SM30: atom.cas.b64
- ; SM60: atom.max.u64
- %5 = atomicrmw umax ptr %0, i64 %1 monotonic, align 8
- ret void
-}
-
-; CHECK-LABEL: bitwise_i8
-define void @bitwise_i8(ptr %0, i8 %1) {
-entry:
- ; ALL: atom.and.b32
- %2 = atomicrmw and ptr %0, i8 %1 monotonic, align 1
- ; ALL: atom.or.b32
- %3 = atomicrmw or ptr %0, i8 %1 monotonic, align 1
- ; ALL: atom.xor.b32
- %4 = atomicrmw xor ptr %0, i8 %1 monotonic, align 1
- ; SM30: atom.cas.b32
- ; SM60: atom.sys.cas.b32
- %5 = atomicrmw xchg ptr %0, i8 %1 monotonic, align 1
- ret void
-}
-
-; CHECK-LABEL: minmax_i8
-define void @minmax_i8(ptr %0, i8 %1) {
-entry:
- ; SM30: atom.cas.b32
- ; SM60: atom.sys.cas.b32
- %2 = atomicrmw min ptr %0, i8 %1 monotonic, align 1
- ; SM30: atom.cas.b32
- ; SM60: atom.sys.cas.b32
- %3 = atomicrmw max ptr %0, i8 %1 monotonic, align 1
- ; SM30: atom.cas.b32
- ; SM60: atom.sys.cas.b32
- %4 = atomicrmw umin ptr %0, i8 %1 monotonic, align 1
- ; SM30: atom.cas.b32
- ; SM60: atom.sys.cas.b32
- %5 = atomicrmw umax ptr %0, i8 %1 monotonic, align 1
- ret void
-}
-
-; CHECK-LABEL: bitwise_i16
-define void @bitwise_i16(ptr %0, i16 %1) {
-entry:
- ; ALL: atom.and.b32
- %2 = atomicrmw and ptr %0, i16 %1 monotonic, align 2
- ; ALL: atom.or.b32
- %3 = atomicrmw or ptr %0, i16 %1 monotonic, align 2
- ; ALL: atom.xor.b32
- %4 = atomicrmw xor ptr %0, i16 %1 monotonic, align 2
- ; SM30: atom.cas.b32
- ; SM60: atom.sys.cas.b32
- %5 = atomicrmw xchg ptr %0, i16 %1 monotonic, align 2
- ret void
-}
-
-; CHECK-LABEL: minmax_i16
-define void @minmax_i16(ptr %0, i16 %1) {
-entry:
- ; SM30: atom.cas.b32
- ; SM60: atom.sys.cas.b32
- %2 = atomicrmw min ptr %0, i16 %1 monotonic, align 2
- ; SM30: atom.cas.b32
- ; SM60: atom.sys.cas.b32
- %3 = atomicrmw max ptr %0, i16 %1 monotonic, align 2
- ; SM30: atom.cas.b32
- ; SM60: atom.sys.cas.b32
- %4 = atomicrmw umin ptr %0, i16 %1 monotonic, align 2
- ; SM30: atom.cas.b32
- ; SM60: atom.sys.cas.b32
- %5 = atomicrmw umax ptr %0, i16 %1 monotonic, align 2
- ret void
-}
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-sm60.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-sm60.ll
new file mode 100644
index 0000000000000..49d8701f25e46
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomicrmw-sm60.ll
@@ -0,0 +1,3137 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %}
+
+define i8 @xchg_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: xchg_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<14>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b8 %r5, [xchg_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r6, %rd2;
+; SM60-NEXT: and.b32 %r7, %r6, 3;
+; SM60-NEXT: shl.b32 %r1, %r7, 3;
+; SM60-NEXT: mov.b32 %r8, 255;
+; SM60-NEXT: shl.b32 %r9, %r8, %r1;
+; SM60-NEXT: not.b32 %r2, %r9;
+; SM60-NEXT: shl.b32 %r3, %r5, %r1;
+; SM60-NEXT: ld.global.b32 %r13, [%rd1];
+; SM60-NEXT: $L__BB0_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r10, %r13, %r2;
+; SM60-NEXT: or.b32 %r11, %r10, %r3;
+; SM60-NEXT: atom.cta.global.cas.b32 %r4, [%rd1], %r13, %r11;
+; SM60-NEXT: setp.ne.b32 %p1, %r4, %r13;
+; SM60-NEXT: mov.b32 %r13, %r4;
+; SM60-NEXT: @%p1 bra $L__BB0_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r12, %r4, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r12;
+; SM60-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @xchg_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: xchg_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<14>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b16 %r5, [xchg_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r6, %rd2;
+; SM60-NEXT: and.b32 %r7, %r6, 3;
+; SM60-NEXT: shl.b32 %r1, %r7, 3;
+; SM60-NEXT: mov.b32 %r8, 65535;
+; SM60-NEXT: shl.b32 %r9, %r8, %r1;
+; SM60-NEXT: not.b32 %r2, %r9;
+; SM60-NEXT: shl.b32 %r3, %r5, %r1;
+; SM60-NEXT: ld.global.b32 %r13, [%rd1];
+; SM60-NEXT: $L__BB1_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r10, %r13, %r2;
+; SM60-NEXT: or.b32 %r11, %r10, %r3;
+; SM60-NEXT: atom.cta.global.cas.b32 %r4, [%rd1], %r13, %r11;
+; SM60-NEXT: setp.ne.b32 %p1, %r4, %r13;
+; SM60-NEXT: mov.b32 %r13, %r4;
+; SM60-NEXT: @%p1 bra $L__BB1_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r12, %r4, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r12;
+; SM60-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @xchg_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: xchg_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [xchg_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [xchg_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.exch.b32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @xchg_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: xchg_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [xchg_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.exch.b64 %rd3, [%rd1], %rd2;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @add_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: add_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [add_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b8 %r6, [add_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 255;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: $L__BB4_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: add.s32 %r10, %r15, %r4;
+; SM60-NEXT: and.b32 %r11, %r10, %r2;
+; SM60-NEXT: and.b32 %r12, %r15, %r3;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM60-NEXT: mov.b32 %r15, %r5;
+; SM60-NEXT: @%p1 bra $L__BB4_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r5, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @add_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: add_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [add_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b16 %r6, [add_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 65535;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: $L__BB5_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: add.s32 %r10, %r15, %r4;
+; SM60-NEXT: and.b32 %r11, %r10, %r2;
+; SM60-NEXT: and.b32 %r12, %r15, %r3;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM60-NEXT: mov.b32 %r15, %r5;
+; SM60-NEXT: @%p1 bra $L__BB5_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r5, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @add_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: add_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [add_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [add_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @add_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: add_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [add_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [add_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.add.u64 %rd3, [%rd1], %rd2;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @sub_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: sub_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b8 %r6, [sub_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 255;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: $L__BB8_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: sub.s32 %r10, %r15, %r4;
+; SM60-NEXT: and.b32 %r11, %r10, %r2;
+; SM60-NEXT: and.b32 %r12, %r15, %r3;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM60-NEXT: mov.b32 %r15, %r5;
+; SM60-NEXT: @%p1 bra $L__BB8_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r5, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @sub_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: sub_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b16 %r6, [sub_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 65535;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: $L__BB9_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: sub.s32 %r10, %r15, %r4;
+; SM60-NEXT: and.b32 %r11, %r10, %r2;
+; SM60-NEXT: and.b32 %r12, %r15, %r3;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM60-NEXT: mov.b32 %r15, %r5;
+; SM60-NEXT: @%p1 bra $L__BB9_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r5, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @sub_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: sub_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<4>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [sub_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [sub_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: neg.s32 %r2, %r1;
+; SM60-NEXT: atom.cta.global.add.u32 %r3, [%rd1], %r2;
+; SM60-NEXT: st.param.b32 [func_retval0], %r3;
+; SM60-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @sub_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: sub_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<5>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [sub_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: neg.s64 %rd3, %rd2;
+; SM60-NEXT: atom.cta.global.add.u64 %rd4, [%rd1], %rd3;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM60-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @and_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: and_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<12>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [and_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b8 %r1, [and_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd2, %rd1, -4;
+; SM60-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM60-NEXT: and.b32 %r3, %r2, 3;
+; SM60-NEXT: shl.b32 %r4, %r3, 3;
+; SM60-NEXT: mov.b32 %r5, 255;
+; SM60-NEXT: shl.b32 %r6, %r5, %r4;
+; SM60-NEXT: not.b32 %r7, %r6;
+; SM60-NEXT: shl.b32 %r8, %r1, %r4;
+; SM60-NEXT: or.b32 %r9, %r8, %r7;
+; SM60-NEXT: atom.cta.global.and.b32 %r10, [%rd2], %r9;
+; SM60-NEXT: shr.u32 %r11, %r10, %r4;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r11;
+; SM60-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @and_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: and_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<12>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [and_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b16 %r1, [and_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd2, %rd1, -4;
+; SM60-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM60-NEXT: and.b32 %r3, %r2, 3;
+; SM60-NEXT: shl.b32 %r4, %r3, 3;
+; SM60-NEXT: mov.b32 %r5, 65535;
+; SM60-NEXT: shl.b32 %r6, %r5, %r4;
+; SM60-NEXT: not.b32 %r7, %r6;
+; SM60-NEXT: shl.b32 %r8, %r1, %r4;
+; SM60-NEXT: or.b32 %r9, %r8, %r7;
+; SM60-NEXT: atom.cta.global.and.b32 %r10, [%rd2], %r9;
+; SM60-NEXT: shr.u32 %r11, %r10, %r4;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r11;
+; SM60-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @and_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: and_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [and_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [and_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.and.b32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @and_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: and_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [and_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [and_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.and.b64 %rd3, [%rd1], %rd2;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @nand_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: nand_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<17>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b8 %r6, [nand_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 255;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r16, [%rd1];
+; SM60-NEXT: $L__BB16_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r10, %r16, %r4;
+; SM60-NEXT: not.b32 %r11, %r10;
+; SM60-NEXT: and.b32 %r12, %r11, %r2;
+; SM60-NEXT: and.b32 %r13, %r16, %r3;
+; SM60-NEXT: or.b32 %r14, %r13, %r12;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM60-NEXT: mov.b32 %r16, %r5;
+; SM60-NEXT: @%p1 bra $L__BB16_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r15, %r5, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r15;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @nand_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: nand_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<17>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b16 %r6, [nand_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 65535;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r16, [%rd1];
+; SM60-NEXT: $L__BB17_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r10, %r16, %r4;
+; SM60-NEXT: not.b32 %r11, %r10;
+; SM60-NEXT: and.b32 %r12, %r11, %r2;
+; SM60-NEXT: and.b32 %r13, %r16, %r3;
+; SM60-NEXT: or.b32 %r14, %r13, %r12;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM60-NEXT: mov.b32 %r16, %r5;
+; SM60-NEXT: @%p1 bra $L__BB17_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r15, %r5, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r15;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @nand_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: nand_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<6>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [nand_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [nand_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b32 %r5, [%rd1];
+; SM60-NEXT: $L__BB18_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r3, %r5, %r2;
+; SM60-NEXT: not.b32 %r4, %r3;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM60-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM60-NEXT: mov.b32 %r5, %r1;
+; SM60-NEXT: @%p1 bra $L__BB18_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @nand_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: nand_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b64 %rd<7>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd3, [nand_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM60-NEXT: $L__BB19_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b64 %rd4, %rd6, %rd3;
+; SM60-NEXT: not.b64 %rd5, %rd4;
+; SM60-NEXT: atom.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM60-NEXT: setp.ne.b64 %p1, %rd1, %rd6;
+; SM60-NEXT: mov.b64 %rd6, %rd1;
+; SM60-NEXT: @%p1 bra $L__BB19_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @or_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: or_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<8>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [or_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b8 %r1, [or_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd2, %rd1, -4;
+; SM60-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM60-NEXT: and.b32 %r3, %r2, 3;
+; SM60-NEXT: shl.b32 %r4, %r3, 3;
+; SM60-NEXT: shl.b32 %r5, %r1, %r4;
+; SM60-NEXT: atom.cta.global.or.b32 %r6, [%rd2], %r5;
+; SM60-NEXT: shr.u32 %r7, %r6, %r4;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r7;
+; SM60-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @or_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: or_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<8>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [or_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b16 %r1, [or_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd2, %rd1, -4;
+; SM60-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM60-NEXT: and.b32 %r3, %r2, 3;
+; SM60-NEXT: shl.b32 %r4, %r3, 3;
+; SM60-NEXT: shl.b32 %r5, %r1, %r4;
+; SM60-NEXT: atom.cta.global.or.b32 %r6, [%rd2], %r5;
+; SM60-NEXT: shr.u32 %r7, %r6, %r4;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r7;
+; SM60-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @or_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: or_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [or_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [or_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.or.b32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @or_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: or_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [or_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [or_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.or.b64 %rd3, [%rd1], %rd2;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @xor_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: xor_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<8>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b8 %r1, [xor_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd2, %rd1, -4;
+; SM60-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM60-NEXT: and.b32 %r3, %r2, 3;
+; SM60-NEXT: shl.b32 %r4, %r3, 3;
+; SM60-NEXT: shl.b32 %r5, %r1, %r4;
+; SM60-NEXT: atom.cta.global.xor.b32 %r6, [%rd2], %r5;
+; SM60-NEXT: shr.u32 %r7, %r6, %r4;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r7;
+; SM60-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @xor_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: xor_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<8>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b16 %r1, [xor_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd2, %rd1, -4;
+; SM60-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM60-NEXT: and.b32 %r3, %r2, 3;
+; SM60-NEXT: shl.b32 %r4, %r3, 3;
+; SM60-NEXT: shl.b32 %r5, %r1, %r4;
+; SM60-NEXT: atom.cta.global.xor.b32 %r6, [%rd2], %r5;
+; SM60-NEXT: shr.u32 %r7, %r6, %r4;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r7;
+; SM60-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @xor_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: xor_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [xor_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.xor.b32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @xor_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: xor_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [xor_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.xor.b64 %rd3, [%rd1], %rd2;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @max_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: max_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<5>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b8 %rs1, [max_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [max_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 255;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: cvt.s16.s8 %rs3, %rs1;
+; SM60-NEXT: $L__BB28_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r15, %r1;
+; SM60-NEXT: cvt.s8.s32 %rs2, %r8;
+; SM60-NEXT: max.s16 %rs4, %rs2, %rs3;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM60-NEXT: and.b32 %r10, %r9, 255;
+; SM60-NEXT: shl.b32 %r11, %r10, %r1;
+; SM60-NEXT: and.b32 %r12, %r15, %r2;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r15;
+; SM60-NEXT: mov.b32 %r15, %r3;
+; SM60-NEXT: @%p1 bra $L__BB28_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @max_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: max_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<4>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [max_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [max_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB29_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: max.s16 %rs3, %rs2, %rs1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p1 bra $L__BB29_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @max_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: max_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [max_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [max_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.max.s32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @max_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: max_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [max_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [max_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.max.s64 %rd3, [%rd1], %rd2;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @min_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: min_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<5>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b8 %rs1, [min_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [min_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 255;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: cvt.s16.s8 %rs3, %rs1;
+; SM60-NEXT: $L__BB32_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r15, %r1;
+; SM60-NEXT: cvt.s8.s32 %rs2, %r8;
+; SM60-NEXT: min.s16 %rs4, %rs2, %rs3;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM60-NEXT: and.b32 %r10, %r9, 255;
+; SM60-NEXT: shl.b32 %r11, %r10, %r1;
+; SM60-NEXT: and.b32 %r12, %r15, %r2;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r15;
+; SM60-NEXT: mov.b32 %r15, %r3;
+; SM60-NEXT: @%p1 bra $L__BB32_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @min_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: min_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<4>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [min_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [min_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB33_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: min.s16 %rs3, %rs2, %rs1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p1 bra $L__BB33_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @min_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: min_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [min_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [min_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.min.s32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @min_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: min_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [min_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [min_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.min.s64 %rd3, [%rd1], %rd2;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @umax_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: umax_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<5>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b8 %rs1, [umax_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 255;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB36_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: and.b16 %rs3, %rs2, 255;
+; SM60-NEXT: max.u16 %rs4, %rs3, %rs1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p1 bra $L__BB36_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @umax_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: umax_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<4>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [umax_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB37_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: max.u16 %rs3, %rs2, %rs1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p1 bra $L__BB37_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @umax_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: umax_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [umax_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [umax_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.max.u32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @umax_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: umax_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [umax_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.max.u64 %rd3, [%rd1], %rd2;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @umin_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: umin_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<5>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b8 %rs1, [umin_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 255;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB40_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: and.b16 %rs3, %rs2, 255;
+; SM60-NEXT: min.u16 %rs4, %rs3, %rs1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p1 bra $L__BB40_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @umin_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: umin_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<4>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [umin_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB41_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: min.u16 %rs3, %rs2, %rs1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p1 bra $L__BB41_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @umin_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: umin_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [umin_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [umin_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.min.u32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @umin_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: umin_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [umin_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.min.u64 %rd3, [%rd1], %rd2;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @uinc_wrap_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: uinc_wrap_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<6>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b8 %rs1, [uinc_wrap_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 255;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: $L__BB44_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r15, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: and.b16 %rs3, %rs2, 255;
+; SM60-NEXT: add.s16 %rs4, %rs2, 1;
+; SM60-NEXT: setp.ge.u16 %p1, %rs3, %rs1;
+; SM60-NEXT: selp.b16 %rs5, 0, %rs4, %p1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM60-NEXT: and.b32 %r10, %r9, 255;
+; SM60-NEXT: shl.b32 %r11, %r10, %r1;
+; SM60-NEXT: and.b32 %r12, %r15, %r2;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p2, %r3, %r15;
+; SM60-NEXT: mov.b32 %r15, %r3;
+; SM60-NEXT: @%p2 bra $L__BB44_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @uinc_wrap_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: uinc_wrap_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<5>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [uinc_wrap_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB45_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: add.s16 %rs3, %rs2, 1;
+; SM60-NEXT: setp.ge.u16 %p1, %rs2, %rs1;
+; SM60-NEXT: selp.b16 %rs4, 0, %rs3, %p1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p2, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p2 bra $L__BB45_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @uinc_wrap_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: uinc_wrap_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [uinc_wrap_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [uinc_wrap_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.inc.u32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @uinc_wrap_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: uinc_wrap_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b64 %rd<7>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd3, [uinc_wrap_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM60-NEXT: $L__BB47_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: add.s64 %rd4, %rd6, 1;
+; SM60-NEXT: setp.ge.u64 %p1, %rd6, %rd3;
+; SM60-NEXT: selp.b64 %rd5, 0, %rd4, %p1;
+; SM60-NEXT: atom.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM60-NEXT: setp.ne.b64 %p2, %rd1, %rd6;
+; SM60-NEXT: mov.b64 %rd6, %rd1;
+; SM60-NEXT: @%p2 bra $L__BB47_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @udec_wrap_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: udec_wrap_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<4>;
+; SM60-NEXT: .reg .b16 %rs<7>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b8 %rs1, [udec_wrap_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 255;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: $L__BB48_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r15, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: and.b16 %rs3, %rs2, 255;
+; SM60-NEXT: add.s16 %rs4, %rs2, -1;
+; SM60-NEXT: setp.eq.b16 %p1, %rs3, 0;
+; SM60-NEXT: setp.gt.u16 %p2, %rs3, %rs1;
+; SM60-NEXT: selp.b16 %rs5, %rs1, %rs4, %p2;
+; SM60-NEXT: selp.b16 %rs6, %rs1, %rs5, %p1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs6;
+; SM60-NEXT: and.b32 %r10, %r9, 255;
+; SM60-NEXT: shl.b32 %r11, %r10, %r1;
+; SM60-NEXT: and.b32 %r12, %r15, %r2;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p3, %r3, %r15;
+; SM60-NEXT: mov.b32 %r15, %r3;
+; SM60-NEXT: @%p3 bra $L__BB48_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @udec_wrap_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: udec_wrap_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<4>;
+; SM60-NEXT: .reg .b16 %rs<6>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [udec_wrap_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB49_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: add.s16 %rs3, %rs2, -1;
+; SM60-NEXT: setp.eq.b16 %p1, %rs2, 0;
+; SM60-NEXT: setp.gt.u16 %p2, %rs2, %rs1;
+; SM60-NEXT: selp.b16 %rs4, %rs1, %rs3, %p2;
+; SM60-NEXT: selp.b16 %rs5, %rs1, %rs4, %p1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p3, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p3 bra $L__BB49_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @udec_wrap_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: udec_wrap_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [udec_wrap_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [udec_wrap_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.dec.u32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @udec_wrap_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: udec_wrap_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<4>;
+; SM60-NEXT: .reg .b64 %rd<8>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd3, [udec_wrap_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b64 %rd7, [%rd2];
+; SM60-NEXT: $L__BB51_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: add.s64 %rd4, %rd7, -1;
+; SM60-NEXT: setp.eq.b64 %p1, %rd7, 0;
+; SM60-NEXT: setp.gt.u64 %p2, %rd7, %rd3;
+; SM60-NEXT: selp.b64 %rd5, %rd3, %rd4, %p2;
+; SM60-NEXT: selp.b64 %rd6, %rd3, %rd5, %p1;
+; SM60-NEXT: atom.cta.global.cas.b64 %rd1, [%rd2], %rd7, %rd6;
+; SM60-NEXT: setp.ne.b64 %p3, %rd1, %rd7;
+; SM60-NEXT: mov.b64 %rd7, %rd1;
+; SM60-NEXT: @%p3 bra $L__BB51_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @usub_cond_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: usub_cond_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<6>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b8 %rs1, [usub_cond_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 255;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: $L__BB52_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r15, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: and.b16 %rs3, %rs2, 255;
+; SM60-NEXT: setp.ge.u16 %p1, %rs3, %rs1;
+; SM60-NEXT: sub.s16 %rs4, %rs2, %rs1;
+; SM60-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM60-NEXT: and.b32 %r10, %r9, 255;
+; SM60-NEXT: shl.b32 %r11, %r10, %r1;
+; SM60-NEXT: and.b32 %r12, %r15, %r2;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p2, %r3, %r15;
+; SM60-NEXT: mov.b32 %r15, %r3;
+; SM60-NEXT: @%p2 bra $L__BB52_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @usub_cond_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: usub_cond_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<5>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [usub_cond_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB53_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: setp.ge.u16 %p1, %rs2, %rs1;
+; SM60-NEXT: sub.s16 %rs3, %rs2, %rs1;
+; SM60-NEXT: selp.b16 %rs4, %rs3, %rs2, %p1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p2, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p2 bra $L__BB53_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @usub_cond_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: usub_cond_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b32 %r<6>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [usub_cond_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [usub_cond_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b32 %r5, [%rd1];
+; SM60-NEXT: $L__BB54_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: setp.ge.u32 %p1, %r5, %r2;
+; SM60-NEXT: sub.s32 %r3, %r5, %r2;
+; SM60-NEXT: selp.b32 %r4, %r3, %r5, %p1;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM60-NEXT: setp.ne.b32 %p2, %r1, %r5;
+; SM60-NEXT: mov.b32 %r5, %r1;
+; SM60-NEXT: @%p2 bra $L__BB54_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @usub_cond_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: usub_cond_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b64 %rd<7>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd3, [usub_cond_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM60-NEXT: $L__BB55_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: setp.ge.u64 %p1, %rd6, %rd3;
+; SM60-NEXT: sub.s64 %rd4, %rd6, %rd3;
+; SM60-NEXT: selp.b64 %rd5, %rd4, %rd6, %p1;
+; SM60-NEXT: atom.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM60-NEXT: setp.ne.b64 %p2, %rd1, %rd6;
+; SM60-NEXT: mov.b64 %rd6, %rd1;
+; SM60-NEXT: @%p2 bra $L__BB55_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @usub_sat_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: usub_sat_acq_rel_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<6>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b8 %rs1, [usub_sat_acq_rel_i8_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 255;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB56_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: and.b16 %rs3, %rs2, 255;
+; SM60-NEXT: max.u16 %rs4, %rs3, %rs1;
+; SM60-NEXT: sub.s16 %rs5, %rs4, %rs1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p1 bra $L__BB56_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @usub_sat_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM60-LABEL: usub_sat_acq_rel_i16_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<5>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [usub_sat_acq_rel_i16_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i16_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB57_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: max.u16 %rs3, %rs2, %rs1;
+; SM60-NEXT: sub.s16 %rs4, %rs3, %rs1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p1 bra $L__BB57_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @usub_sat_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: usub_sat_acq_rel_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<6>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [usub_sat_acq_rel_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [usub_sat_acq_rel_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b32 %r5, [%rd1];
+; SM60-NEXT: $L__BB58_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: max.u32 %r3, %r5, %r2;
+; SM60-NEXT: sub.s32 %r4, %r3, %r2;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM60-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM60-NEXT: mov.b32 %r5, %r1;
+; SM60-NEXT: @%p1 bra $L__BB58_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @usub_sat_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM60-LABEL: usub_sat_acq_rel_i64_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b64 %rd<7>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd3, [usub_sat_acq_rel_i64_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i64_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM60-NEXT: $L__BB59_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: max.u64 %rd4, %rd6, %rd3;
+; SM60-NEXT: sub.s64 %rd5, %rd4, %rd3;
+; SM60-NEXT: atom.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM60-NEXT: setp.ne.b64 %p1, %rd1, %rd6;
+; SM60-NEXT: mov.b64 %rd6, %rd1;
+; SM60-NEXT: @%p1 bra $L__BB59_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define float @fadd_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM60-LABEL: fadd_acq_rel_float_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [fadd_acq_rel_float_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [fadd_acq_rel_float_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.add.f32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fadd ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fsub_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM60-LABEL: fsub_acq_rel_float_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<5>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [fsub_acq_rel_float_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [fsub_acq_rel_float_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b32 %r4, [%rd1];
+; SM60-NEXT: $L__BB61_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: sub.rn.f32 %r3, %r4, %r2;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
+; SM60-NEXT: setp.ne.b32 %p1, %r1, %r4;
+; SM60-NEXT: mov.b32 %r4, %r1;
+; SM60-NEXT: @%p1 bra $L__BB61_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fsub ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fmin_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM60-LABEL: fmin_acq_rel_float_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<5>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [fmin_acq_rel_float_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [fmin_acq_rel_float_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b32 %r4, [%rd1];
+; SM60-NEXT: $L__BB62_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: min.f32 %r3, %r4, %r2;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
+; SM60-NEXT: setp.ne.b32 %p1, %r1, %r4;
+; SM60-NEXT: mov.b32 %r4, %r1;
+; SM60-NEXT: @%p1 bra $L__BB62_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmin ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fmax_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM60-LABEL: fmax_acq_rel_float_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<5>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [fmax_acq_rel_float_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [fmax_acq_rel_float_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b32 %r4, [%rd1];
+; SM60-NEXT: $L__BB63_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: max.f32 %r3, %r4, %r2;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
+; SM60-NEXT: setp.ne.b32 %p1, %r1, %r4;
+; SM60-NEXT: mov.b32 %r4, %r1;
+; SM60-NEXT: @%p1 bra $L__BB63_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmax ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fminimum_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM60-LABEL: fminimum_acq_rel_float_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<6>;
+; SM60-NEXT: .reg .b32 %r<9>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [fminimum_acq_rel_float_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [fminimum_acq_rel_float_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b32 %r8, [%rd1];
+; SM60-NEXT: setp.eq.b32 %p3, %r2, -2147483648;
+; SM60-NEXT: $L__BB64_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: setp.nan.f32 %p1, %r8, %r2;
+; SM60-NEXT: min.f32 %r3, %r8, %r2;
+; SM60-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1;
+; SM60-NEXT: setp.eq.b32 %p2, %r8, -2147483648;
+; SM60-NEXT: selp.f32 %r5, %r8, %r4, %p2;
+; SM60-NEXT: selp.f32 %r6, %r2, %r5, %p3;
+; SM60-NEXT: setp.eq.f32 %p4, %r4, 0f00000000;
+; SM60-NEXT: selp.f32 %r7, %r6, %r4, %p4;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r8, %r7;
+; SM60-NEXT: setp.ne.b32 %p5, %r1, %r8;
+; SM60-NEXT: mov.b32 %r8, %r1;
+; SM60-NEXT: @%p5 bra $L__BB64_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fminimum ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fmaximum_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM60-LABEL: fmaximum_acq_rel_float_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<6>;
+; SM60-NEXT: .reg .b32 %r<9>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [fmaximum_acq_rel_float_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [fmaximum_acq_rel_float_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b32 %r8, [%rd1];
+; SM60-NEXT: setp.eq.b32 %p3, %r2, 0;
+; SM60-NEXT: $L__BB65_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: setp.nan.f32 %p1, %r8, %r2;
+; SM60-NEXT: max.f32 %r3, %r8, %r2;
+; SM60-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1;
+; SM60-NEXT: setp.eq.b32 %p2, %r8, 0;
+; SM60-NEXT: selp.f32 %r5, %r8, %r4, %p2;
+; SM60-NEXT: selp.f32 %r6, %r2, %r5, %p3;
+; SM60-NEXT: setp.eq.f32 %p4, %r4, 0f00000000;
+; SM60-NEXT: selp.f32 %r7, %r6, %r4, %p4;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r8, %r7;
+; SM60-NEXT: setp.ne.b32 %p5, %r1, %r8;
+; SM60-NEXT: mov.b32 %r8, %r1;
+; SM60-NEXT: @%p5 bra $L__BB65_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmaximum ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define double @fadd_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM60-LABEL: fadd_acq_rel_double_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b64 %rd<4>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [fadd_acq_rel_double_global_cta_param_0];
+; SM60-NEXT: ld.param.b64 %rd2, [fadd_acq_rel_double_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.add.f64 %rd3, [%rd1], %rd2;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fadd ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define double @fsub_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM60-LABEL: fsub_acq_rel_double_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b64 %rd<6>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd3, [fsub_acq_rel_double_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fsub_acq_rel_double_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b64 %rd5, [%rd2];
+; SM60-NEXT: $L__BB67_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: sub.rn.f64 %rd4, %rd5, %rd3;
+; SM60-NEXT: atom.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
+; SM60-NEXT: setp.ne.b64 %p1, %rd1, %rd5;
+; SM60-NEXT: mov.b64 %rd5, %rd1;
+; SM60-NEXT: @%p1 bra $L__BB67_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fsub ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define double @fmin_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM60-LABEL: fmin_acq_rel_double_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b64 %rd<6>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd3, [fmin_acq_rel_double_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fmin_acq_rel_double_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b64 %rd5, [%rd2];
+; SM60-NEXT: $L__BB68_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: min.f64 %rd4, %rd5, %rd3;
+; SM60-NEXT: atom.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
+; SM60-NEXT: setp.ne.b64 %p1, %rd1, %rd5;
+; SM60-NEXT: mov.b64 %rd5, %rd1;
+; SM60-NEXT: @%p1 bra $L__BB68_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmin ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define double @fmax_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM60-LABEL: fmax_acq_rel_double_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b64 %rd<6>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd3, [fmax_acq_rel_double_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fmax_acq_rel_double_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b64 %rd5, [%rd2];
+; SM60-NEXT: $L__BB69_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: max.f64 %rd4, %rd5, %rd3;
+; SM60-NEXT: atom.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
+; SM60-NEXT: setp.ne.b64 %p1, %rd1, %rd5;
+; SM60-NEXT: mov.b64 %rd5, %rd1;
+; SM60-NEXT: @%p1 bra $L__BB69_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmax ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define double @fminimum_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM60-LABEL: fminimum_acq_rel_double_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<6>;
+; SM60-NEXT: .reg .b64 %rd<10>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd3, [fminimum_acq_rel_double_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fminimum_acq_rel_double_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b64 %rd9, [%rd2];
+; SM60-NEXT: setp.eq.b64 %p3, %rd3, -9223372036854775808;
+; SM60-NEXT: $L__BB70_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: setp.nan.f64 %p1, %rd9, %rd3;
+; SM60-NEXT: min.f64 %rd4, %rd9, %rd3;
+; SM60-NEXT: selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
+; SM60-NEXT: setp.eq.b64 %p2, %rd9, -9223372036854775808;
+; SM60-NEXT: selp.f64 %rd6, %rd9, %rd5, %p2;
+; SM60-NEXT: selp.f64 %rd7, %rd3, %rd6, %p3;
+; SM60-NEXT: setp.eq.f64 %p4, %rd5, 0d0000000000000000;
+; SM60-NEXT: selp.f64 %rd8, %rd7, %rd5, %p4;
+; SM60-NEXT: atom.cta.global.cas.b64 %rd1, [%rd2], %rd9, %rd8;
+; SM60-NEXT: setp.ne.b64 %p5, %rd1, %rd9;
+; SM60-NEXT: mov.b64 %rd9, %rd1;
+; SM60-NEXT: @%p5 bra $L__BB70_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fminimum ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define double @fmaximum_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM60-LABEL: fmaximum_acq_rel_double_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<6>;
+; SM60-NEXT: .reg .b64 %rd<10>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd3, [fmaximum_acq_rel_double_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fmaximum_acq_rel_double_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b64 %rd9, [%rd2];
+; SM60-NEXT: setp.eq.b64 %p3, %rd3, 0;
+; SM60-NEXT: $L__BB71_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: setp.nan.f64 %p1, %rd9, %rd3;
+; SM60-NEXT: max.f64 %rd4, %rd9, %rd3;
+; SM60-NEXT: selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
+; SM60-NEXT: setp.eq.b64 %p2, %rd9, 0;
+; SM60-NEXT: selp.f64 %rd6, %rd9, %rd5, %p2;
+; SM60-NEXT: selp.f64 %rd7, %rd3, %rd6, %p3;
+; SM60-NEXT: setp.eq.f64 %p4, %rd5, 0d0000000000000000;
+; SM60-NEXT: selp.f64 %rd8, %rd7, %rd5, %p4;
+; SM60-NEXT: atom.cta.global.cas.b64 %rd1, [%rd2], %rd9, %rd8;
+; SM60-NEXT: setp.ne.b64 %p5, %rd1, %rd9;
+; SM60-NEXT: mov.b64 %rd9, %rd1;
+; SM60-NEXT: @%p5 bra $L__BB71_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmaximum ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define half @fadd_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM60-LABEL: fadd_acq_rel_half_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<4>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fadd_acq_rel_half_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fadd_acq_rel_half_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB72_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: add.rn.f16 %rs3, %rs2, %rs1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p1 bra $L__BB72_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fadd ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define half @fsub_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM60-LABEL: fsub_acq_rel_half_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<4>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fsub_acq_rel_half_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fsub_acq_rel_half_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: $L__BB73_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: sub.rn.f16 %rs3, %rs2, %rs1;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p1 bra $L__BB73_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fsub ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define half @fmin_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM60-LABEL: fmin_acq_rel_half_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<4>;
+; SM60-NEXT: .reg .b32 %r<18>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fmin_acq_rel_half_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fmin_acq_rel_half_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r17, [%rd1];
+; SM60-NEXT: cvt.f32.f16 %r10, %rs1;
+; SM60-NEXT: $L__BB74_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r17, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: cvt.f32.f16 %r9, %rs2;
+; SM60-NEXT: min.f32 %r11, %r9, %r10;
+; SM60-NEXT: cvt.rn.f16.f32 %rs3, %r11;
+; SM60-NEXT: cvt.u32.u16 %r12, %rs3;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: and.b32 %r14, %r17, %r2;
+; SM60-NEXT: or.b32 %r15, %r14, %r13;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r17, %r15;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r17;
+; SM60-NEXT: mov.b32 %r17, %r3;
+; SM60-NEXT: @%p1 bra $L__BB74_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r16, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r16;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmin ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define half @fmax_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM60-LABEL: fmax_acq_rel_half_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b16 %rs<4>;
+; SM60-NEXT: .reg .b32 %r<18>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fmax_acq_rel_half_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fmax_acq_rel_half_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r17, [%rd1];
+; SM60-NEXT: cvt.f32.f16 %r10, %rs1;
+; SM60-NEXT: $L__BB75_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r17, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: cvt.f32.f16 %r9, %rs2;
+; SM60-NEXT: max.f32 %r11, %r9, %r10;
+; SM60-NEXT: cvt.rn.f16.f32 %rs3, %r11;
+; SM60-NEXT: cvt.u32.u16 %r12, %rs3;
+; SM60-NEXT: shl.b32 %r13, %r12, %r1;
+; SM60-NEXT: and.b32 %r14, %r17, %r2;
+; SM60-NEXT: or.b32 %r15, %r14, %r13;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r17, %r15;
+; SM60-NEXT: setp.ne.b32 %p1, %r3, %r17;
+; SM60-NEXT: mov.b32 %r17, %r3;
+; SM60-NEXT: @%p1 bra $L__BB75_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r16, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r16;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmax ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define half @fminimum_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM60-LABEL: fminimum_acq_rel_half_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<7>;
+; SM60-NEXT: .reg .b16 %rs<9>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fminimum_acq_rel_half_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fminimum_acq_rel_half_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: setp.eq.b16 %p4, %rs1, -32768;
+; SM60-NEXT: $L__BB76_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: setp.lt.f16 %p1, %rs2, %rs1;
+; SM60-NEXT: selp.b16 %rs3, %rs2, %rs1, %p1;
+; SM60-NEXT: setp.nan.f16 %p2, %rs2, %rs1;
+; SM60-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2;
+; SM60-NEXT: setp.eq.b16 %p3, %rs2, -32768;
+; SM60-NEXT: selp.b16 %rs5, %rs2, %rs4, %p3;
+; SM60-NEXT: selp.b16 %rs6, %rs1, %rs5, %p4;
+; SM60-NEXT: mov.b16 %rs7, 0x0000;
+; SM60-NEXT: setp.eq.f16 %p5, %rs4, %rs7;
+; SM60-NEXT: selp.b16 %rs8, %rs6, %rs4, %p5;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs8;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p6, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p6 bra $L__BB76_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fminimum ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define half @fmaximum_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM60-LABEL: fmaximum_acq_rel_half_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<7>;
+; SM60-NEXT: .reg .b16 %rs<9>;
+; SM60-NEXT: .reg .b32 %r<15>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fmaximum_acq_rel_half_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fmaximum_acq_rel_half_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r14, [%rd1];
+; SM60-NEXT: setp.eq.b16 %p4, %rs1, 0;
+; SM60-NEXT: $L__BB77_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r14, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: setp.gt.f16 %p1, %rs2, %rs1;
+; SM60-NEXT: selp.b16 %rs3, %rs2, %rs1, %p1;
+; SM60-NEXT: setp.nan.f16 %p2, %rs2, %rs1;
+; SM60-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2;
+; SM60-NEXT: setp.eq.b16 %p3, %rs2, 0;
+; SM60-NEXT: selp.b16 %rs5, %rs2, %rs4, %p3;
+; SM60-NEXT: selp.b16 %rs6, %rs1, %rs5, %p4;
+; SM60-NEXT: mov.b16 %rs7, 0x0000;
+; SM60-NEXT: setp.eq.f16 %p5, %rs4, %rs7;
+; SM60-NEXT: selp.b16 %rs8, %rs6, %rs4, %p5;
+; SM60-NEXT: cvt.u32.u16 %r9, %rs8;
+; SM60-NEXT: shl.b32 %r10, %r9, %r1;
+; SM60-NEXT: and.b32 %r11, %r14, %r2;
+; SM60-NEXT: or.b32 %r12, %r11, %r10;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM60-NEXT: setp.ne.b32 %p6, %r3, %r14;
+; SM60-NEXT: mov.b32 %r14, %r3;
+; SM60-NEXT: @%p6 bra $L__BB77_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r13, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r13;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmaximum ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define bfloat @fadd_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM60-LABEL: fadd_acq_rel_bfloat_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<24>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fadd_acq_rel_bfloat_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fadd_acq_rel_bfloat_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r23, [%rd1];
+; SM60-NEXT: cvt.u32.u16 %r10, %rs1;
+; SM60-NEXT: shl.b32 %r11, %r10, 16;
+; SM60-NEXT: $L__BB78_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r23, %r1;
+; SM60-NEXT: shl.b32 %r9, %r8, 16;
+; SM60-NEXT: add.rn.f32 %r12, %r9, %r11;
+; SM60-NEXT: bfe.u32 %r13, %r12, 16, 1;
+; SM60-NEXT: add.s32 %r14, %r13, %r12;
+; SM60-NEXT: add.s32 %r15, %r14, 32767;
+; SM60-NEXT: setp.nan.f32 %p1, %r12, %r12;
+; SM60-NEXT: or.b32 %r16, %r12, 4194304;
+; SM60-NEXT: selp.b32 %r17, %r16, %r15, %p1;
+; SM60-NEXT: shr.u32 %r18, %r17, 16;
+; SM60-NEXT: shl.b32 %r19, %r18, %r1;
+; SM60-NEXT: and.b32 %r20, %r23, %r2;
+; SM60-NEXT: or.b32 %r21, %r20, %r19;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
+; SM60-NEXT: setp.ne.b32 %p2, %r3, %r23;
+; SM60-NEXT: mov.b32 %r23, %r3;
+; SM60-NEXT: @%p2 bra $L__BB78_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r22, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r22;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fadd ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define bfloat @fsub_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM60-LABEL: fsub_acq_rel_bfloat_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<24>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fsub_acq_rel_bfloat_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fsub_acq_rel_bfloat_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r23, [%rd1];
+; SM60-NEXT: cvt.u32.u16 %r10, %rs1;
+; SM60-NEXT: shl.b32 %r11, %r10, 16;
+; SM60-NEXT: $L__BB79_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r23, %r1;
+; SM60-NEXT: shl.b32 %r9, %r8, 16;
+; SM60-NEXT: sub.rn.f32 %r12, %r9, %r11;
+; SM60-NEXT: bfe.u32 %r13, %r12, 16, 1;
+; SM60-NEXT: add.s32 %r14, %r13, %r12;
+; SM60-NEXT: add.s32 %r15, %r14, 32767;
+; SM60-NEXT: setp.nan.f32 %p1, %r12, %r12;
+; SM60-NEXT: or.b32 %r16, %r12, 4194304;
+; SM60-NEXT: selp.b32 %r17, %r16, %r15, %p1;
+; SM60-NEXT: shr.u32 %r18, %r17, 16;
+; SM60-NEXT: shl.b32 %r19, %r18, %r1;
+; SM60-NEXT: and.b32 %r20, %r23, %r2;
+; SM60-NEXT: or.b32 %r21, %r20, %r19;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
+; SM60-NEXT: setp.ne.b32 %p2, %r3, %r23;
+; SM60-NEXT: mov.b32 %r23, %r3;
+; SM60-NEXT: @%p2 bra $L__BB79_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r22, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r22;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fsub ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define bfloat @fmin_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM60-LABEL: fmin_acq_rel_bfloat_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<24>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fmin_acq_rel_bfloat_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fmin_acq_rel_bfloat_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r23, [%rd1];
+; SM60-NEXT: cvt.u32.u16 %r10, %rs1;
+; SM60-NEXT: shl.b32 %r11, %r10, 16;
+; SM60-NEXT: $L__BB80_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r23, %r1;
+; SM60-NEXT: shl.b32 %r9, %r8, 16;
+; SM60-NEXT: min.f32 %r12, %r9, %r11;
+; SM60-NEXT: bfe.u32 %r13, %r12, 16, 1;
+; SM60-NEXT: add.s32 %r14, %r13, %r12;
+; SM60-NEXT: add.s32 %r15, %r14, 32767;
+; SM60-NEXT: setp.nan.f32 %p1, %r12, %r12;
+; SM60-NEXT: or.b32 %r16, %r12, 4194304;
+; SM60-NEXT: selp.b32 %r17, %r16, %r15, %p1;
+; SM60-NEXT: shr.u32 %r18, %r17, 16;
+; SM60-NEXT: shl.b32 %r19, %r18, %r1;
+; SM60-NEXT: and.b32 %r20, %r23, %r2;
+; SM60-NEXT: or.b32 %r21, %r20, %r19;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
+; SM60-NEXT: setp.ne.b32 %p2, %r3, %r23;
+; SM60-NEXT: mov.b32 %r23, %r3;
+; SM60-NEXT: @%p2 bra $L__BB80_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r22, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r22;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmin ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define bfloat @fmax_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM60-LABEL: fmax_acq_rel_bfloat_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<3>;
+; SM60-NEXT: .reg .b16 %rs<2>;
+; SM60-NEXT: .reg .b32 %r<24>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fmax_acq_rel_bfloat_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fmax_acq_rel_bfloat_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r23, [%rd1];
+; SM60-NEXT: cvt.u32.u16 %r10, %rs1;
+; SM60-NEXT: shl.b32 %r11, %r10, 16;
+; SM60-NEXT: $L__BB81_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r23, %r1;
+; SM60-NEXT: shl.b32 %r9, %r8, 16;
+; SM60-NEXT: max.f32 %r12, %r9, %r11;
+; SM60-NEXT: bfe.u32 %r13, %r12, 16, 1;
+; SM60-NEXT: add.s32 %r14, %r13, %r12;
+; SM60-NEXT: add.s32 %r15, %r14, 32767;
+; SM60-NEXT: setp.nan.f32 %p1, %r12, %r12;
+; SM60-NEXT: or.b32 %r16, %r12, 4194304;
+; SM60-NEXT: selp.b32 %r17, %r16, %r15, %p1;
+; SM60-NEXT: shr.u32 %r18, %r17, 16;
+; SM60-NEXT: shl.b32 %r19, %r18, %r1;
+; SM60-NEXT: and.b32 %r20, %r23, %r2;
+; SM60-NEXT: or.b32 %r21, %r20, %r19;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
+; SM60-NEXT: setp.ne.b32 %p2, %r3, %r23;
+; SM60-NEXT: mov.b32 %r23, %r3;
+; SM60-NEXT: @%p2 bra $L__BB81_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r22, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r22;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmax ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define bfloat @fminimum_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM60-LABEL: fminimum_acq_rel_bfloat_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<7>;
+; SM60-NEXT: .reg .b16 %rs<8>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fminimum_acq_rel_bfloat_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fminimum_acq_rel_bfloat_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r19, [%rd1];
+; SM60-NEXT: cvt.u32.u16 %r10, %rs1;
+; SM60-NEXT: shl.b32 %r11, %r10, 16;
+; SM60-NEXT: setp.eq.b16 %p4, %rs1, -32768;
+; SM60-NEXT: $L__BB82_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r19, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: shl.b32 %r9, %r8, 16;
+; SM60-NEXT: setp.lt.f32 %p1, %r9, %r11;
+; SM60-NEXT: selp.b16 %rs3, %rs2, %rs1, %p1;
+; SM60-NEXT: setp.nan.f32 %p2, %r9, %r11;
+; SM60-NEXT: selp.b16 %rs4, 0x7FC0, %rs3, %p2;
+; SM60-NEXT: setp.eq.b16 %p3, %rs2, -32768;
+; SM60-NEXT: selp.b16 %rs5, %rs2, %rs4, %p3;
+; SM60-NEXT: selp.b16 %rs6, %rs1, %rs5, %p4;
+; SM60-NEXT: cvt.u32.u16 %r12, %rs4;
+; SM60-NEXT: shl.b32 %r13, %r12, 16;
+; SM60-NEXT: setp.eq.f32 %p5, %r13, 0f00000000;
+; SM60-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs7;
+; SM60-NEXT: shl.b32 %r15, %r14, %r1;
+; SM60-NEXT: and.b32 %r16, %r19, %r2;
+; SM60-NEXT: or.b32 %r17, %r16, %r15;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r19, %r17;
+; SM60-NEXT: setp.ne.b32 %p6, %r3, %r19;
+; SM60-NEXT: mov.b32 %r19, %r3;
+; SM60-NEXT: @%p6 bra $L__BB82_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r18, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r18;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fminimum ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define bfloat @fmaximum_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM60-LABEL: fmaximum_acq_rel_bfloat_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<7>;
+; SM60-NEXT: .reg .b16 %rs<8>;
+; SM60-NEXT: .reg .b32 %r<20>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b16 %rs1, [fmaximum_acq_rel_bfloat_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd2, [fmaximum_acq_rel_bfloat_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM60-NEXT: and.b32 %r5, %r4, 3;
+; SM60-NEXT: shl.b32 %r1, %r5, 3;
+; SM60-NEXT: mov.b32 %r6, 65535;
+; SM60-NEXT: shl.b32 %r7, %r6, %r1;
+; SM60-NEXT: not.b32 %r2, %r7;
+; SM60-NEXT: ld.global.b32 %r19, [%rd1];
+; SM60-NEXT: cvt.u32.u16 %r10, %rs1;
+; SM60-NEXT: shl.b32 %r11, %r10, 16;
+; SM60-NEXT: setp.eq.b16 %p4, %rs1, 0;
+; SM60-NEXT: $L__BB83_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: shr.u32 %r8, %r19, %r1;
+; SM60-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM60-NEXT: shl.b32 %r9, %r8, 16;
+; SM60-NEXT: setp.gt.f32 %p1, %r9, %r11;
+; SM60-NEXT: selp.b16 %rs3, %rs2, %rs1, %p1;
+; SM60-NEXT: setp.nan.f32 %p2, %r9, %r11;
+; SM60-NEXT: selp.b16 %rs4, 0x7FC0, %rs3, %p2;
+; SM60-NEXT: setp.eq.b16 %p3, %rs2, 0;
+; SM60-NEXT: selp.b16 %rs5, %rs2, %rs4, %p3;
+; SM60-NEXT: selp.b16 %rs6, %rs1, %rs5, %p4;
+; SM60-NEXT: cvt.u32.u16 %r12, %rs4;
+; SM60-NEXT: shl.b32 %r13, %r12, 16;
+; SM60-NEXT: setp.eq.f32 %p5, %r13, 0f00000000;
+; SM60-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5;
+; SM60-NEXT: cvt.u32.u16 %r14, %rs7;
+; SM60-NEXT: shl.b32 %r15, %r14, %r1;
+; SM60-NEXT: and.b32 %r16, %r19, %r2;
+; SM60-NEXT: or.b32 %r17, %r16, %r15;
+; SM60-NEXT: atom.cta.global.cas.b32 %r3, [%rd1], %r19, %r17;
+; SM60-NEXT: setp.ne.b32 %p6, %r3, %r19;
+; SM60-NEXT: mov.b32 %r19, %r3;
+; SM60-NEXT: @%p6 bra $L__BB83_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r18, %r3, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b16 [func_retval0], %r18;
+; SM60-NEXT: ret;
+ %retval = atomicrmw fmaximum ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define i8 @add_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: add_monotonic_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [add_monotonic_i8_global_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.b8 %r6, [add_monotonic_i8_global_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 255;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: $L__BB84_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: add.s32 %r10, %r15, %r4;
+; SM60-NEXT: and.b32 %r11, %r10, %r2;
+; SM60-NEXT: and.b32 %r12, %r15, %r3;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM60-NEXT: mov.b32 %r15, %r5;
+; SM60-NEXT: @%p1 bra $L__BB84_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r5, %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") monotonic
+ ret i8 %retval
+}
+
+define i8 @add_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: add_acquire_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [add_acquire_i8_global_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.b8 %r6, [add_acquire_i8_global_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 255;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: $L__BB85_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: add.s32 %r10, %r15, %r4;
+; SM60-NEXT: and.b32 %r11, %r10, %r2;
+; SM60-NEXT: and.b32 %r12, %r15, %r3;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM60-NEXT: mov.b32 %r15, %r5;
+; SM60-NEXT: @%p1 bra $L__BB85_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r5, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") acquire
+ ret i8 %retval
+}
+
+define i8 @add_release_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: add_release_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [add_release_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b8 %r6, [add_release_i8_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 255;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: $L__BB86_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: add.s32 %r10, %r15, %r4;
+; SM60-NEXT: and.b32 %r11, %r10, %r2;
+; SM60-NEXT: and.b32 %r12, %r15, %r3;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM60-NEXT: mov.b32 %r15, %r5;
+; SM60-NEXT: @%p1 bra $L__BB86_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r5, %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") release
+ ret i8 %retval
+}
+
+define i8 @add_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: add_seq_cst_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<16>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [add_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b8 %r6, [add_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 255;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r15, [%rd1];
+; SM60-NEXT: $L__BB87_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: add.s32 %r10, %r15, %r4;
+; SM60-NEXT: and.b32 %r11, %r10, %r2;
+; SM60-NEXT: and.b32 %r12, %r15, %r3;
+; SM60-NEXT: or.b32 %r13, %r12, %r11;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM60-NEXT: mov.b32 %r15, %r5;
+; SM60-NEXT: @%p1 bra $L__BB87_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r14, %r5, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r14;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") seq_cst
+ ret i8 %retval
+}
+
+define i32 @add_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: add_monotonic_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [add_monotonic_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [add_monotonic_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") monotonic
+ ret i32 %retval
+}
+
+define i32 @add_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: add_acquire_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [add_acquire_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [add_acquire_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") acquire
+ ret i32 %retval
+}
+
+define i32 @add_release_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: add_release_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [add_release_i32_global_cta_param_0];
+; SM60-NEXT: ld.param.b32 %r1, [add_release_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") release
+ ret i32 %retval
+}
+
+define i32 @add_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: add_seq_cst_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .b32 %r<3>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd1, [add_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b32 %r1, [add_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: atom.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r2;
+; SM60-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") seq_cst
+ ret i32 %retval
+}
+
+define i8 @nand_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: nand_monotonic_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<17>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [nand_monotonic_i8_global_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.b8 %r6, [nand_monotonic_i8_global_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 255;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r16, [%rd1];
+; SM60-NEXT: $L__BB92_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r10, %r16, %r4;
+; SM60-NEXT: not.b32 %r11, %r10;
+; SM60-NEXT: and.b32 %r12, %r11, %r2;
+; SM60-NEXT: and.b32 %r13, %r16, %r3;
+; SM60-NEXT: or.b32 %r14, %r13, %r12;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM60-NEXT: mov.b32 %r16, %r5;
+; SM60-NEXT: @%p1 bra $L__BB92_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r15, %r5, %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r15;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") monotonic
+ ret i8 %retval
+}
+
+define i8 @nand_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: nand_acquire_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<17>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [nand_acquire_i8_global_cta_param_0];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: ld.param.b8 %r6, [nand_acquire_i8_global_cta_param_1];
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 255;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r16, [%rd1];
+; SM60-NEXT: $L__BB93_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r10, %r16, %r4;
+; SM60-NEXT: not.b32 %r11, %r10;
+; SM60-NEXT: and.b32 %r12, %r11, %r2;
+; SM60-NEXT: and.b32 %r13, %r16, %r3;
+; SM60-NEXT: or.b32 %r14, %r13, %r12;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM60-NEXT: mov.b32 %r16, %r5;
+; SM60-NEXT: @%p1 bra $L__BB93_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r15, %r5, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r15;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") acquire
+ ret i8 %retval
+}
+
+define i8 @nand_release_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: nand_release_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<17>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [nand_release_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b8 %r6, [nand_release_i8_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 255;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r16, [%rd1];
+; SM60-NEXT: $L__BB94_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r10, %r16, %r4;
+; SM60-NEXT: not.b32 %r11, %r10;
+; SM60-NEXT: and.b32 %r12, %r11, %r2;
+; SM60-NEXT: and.b32 %r13, %r16, %r3;
+; SM60-NEXT: or.b32 %r14, %r13, %r12;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM60-NEXT: mov.b32 %r16, %r5;
+; SM60-NEXT: @%p1 bra $L__BB94_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r15, %r5, %r1;
+; SM60-NEXT: st.param.b32 [func_retval0], %r15;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") release
+ ret i8 %retval
+}
+
+define i8 @nand_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM60-LABEL: nand_seq_cst_i8_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<17>;
+; SM60-NEXT: .reg .b64 %rd<3>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b64 %rd2, [nand_seq_cst_i8_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.param.b8 %r6, [nand_seq_cst_i8_global_cta_param_1];
+; SM60-NEXT: and.b64 %rd1, %rd2, -4;
+; SM60-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM60-NEXT: and.b32 %r8, %r7, 3;
+; SM60-NEXT: shl.b32 %r1, %r8, 3;
+; SM60-NEXT: mov.b32 %r9, 255;
+; SM60-NEXT: shl.b32 %r2, %r9, %r1;
+; SM60-NEXT: not.b32 %r3, %r2;
+; SM60-NEXT: shl.b32 %r4, %r6, %r1;
+; SM60-NEXT: ld.global.b32 %r16, [%rd1];
+; SM60-NEXT: $L__BB95_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r10, %r16, %r4;
+; SM60-NEXT: not.b32 %r11, %r10;
+; SM60-NEXT: and.b32 %r12, %r11, %r2;
+; SM60-NEXT: and.b32 %r13, %r16, %r3;
+; SM60-NEXT: or.b32 %r14, %r13, %r12;
+; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM60-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM60-NEXT: mov.b32 %r16, %r5;
+; SM60-NEXT: @%p1 bra $L__BB95_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: shr.u32 %r15, %r5, %r1;
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r15;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") seq_cst
+ ret i8 %retval
+}
+
+define i32 @nand_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: nand_monotonic_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<6>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [nand_monotonic_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [nand_monotonic_i32_global_cta_param_0];
+; SM60-NEXT: ld.global.b32 %r5, [%rd1];
+; SM60-NEXT: $L__BB96_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r3, %r5, %r2;
+; SM60-NEXT: not.b32 %r4, %r3;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM60-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM60-NEXT: mov.b32 %r5, %r1;
+; SM60-NEXT: @%p1 bra $L__BB96_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") monotonic
+ ret i32 %retval
+}
+
+define i32 @nand_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: nand_acquire_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<6>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [nand_acquire_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [nand_acquire_i32_global_cta_param_0];
+; SM60-NEXT: ld.global.b32 %r5, [%rd1];
+; SM60-NEXT: $L__BB97_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r3, %r5, %r2;
+; SM60-NEXT: not.b32 %r4, %r3;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM60-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM60-NEXT: mov.b32 %r5, %r1;
+; SM60-NEXT: @%p1 bra $L__BB97_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") acquire
+ ret i32 %retval
+}
+
+define i32 @nand_release_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: nand_release_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<6>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [nand_release_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [nand_release_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b32 %r5, [%rd1];
+; SM60-NEXT: $L__BB98_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r3, %r5, %r2;
+; SM60-NEXT: not.b32 %r4, %r3;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM60-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM60-NEXT: mov.b32 %r5, %r1;
+; SM60-NEXT: @%p1 bra $L__BB98_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") release
+ ret i32 %retval
+}
+
+define i32 @nand_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM60-LABEL: nand_seq_cst_i32_global_cta(
+; SM60: {
+; SM60-NEXT: .reg .pred %p<2>;
+; SM60-NEXT: .reg .b32 %r<6>;
+; SM60-NEXT: .reg .b64 %rd<2>;
+; SM60-EMPTY:
+; SM60-NEXT: // %bb.0:
+; SM60-NEXT: ld.param.b32 %r2, [nand_seq_cst_i32_global_cta_param_1];
+; SM60-NEXT: ld.param.b64 %rd1, [nand_seq_cst_i32_global_cta_param_0];
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: ld.global.b32 %r5, [%rd1];
+; SM60-NEXT: $L__BB99_1: // %atomicrmw.start
+; SM60-NEXT: // =>This Inner Loop Header: Depth=1
+; SM60-NEXT: and.b32 %r3, %r5, %r2;
+; SM60-NEXT: not.b32 %r4, %r3;
+; SM60-NEXT: atom.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM60-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM60-NEXT: mov.b32 %r5, %r1;
+; SM60-NEXT: @%p1 bra $L__BB99_1;
+; SM60-NEXT: // %bb.2: // %atomicrmw.end
+; SM60-NEXT: membar.cta;
+; SM60-NEXT: st.param.b32 [func_retval0], %r1;
+; SM60-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") seq_cst
+ ret i32 %retval
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-sm70.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-sm70.ll
new file mode 100644
index 0000000000000..0bcc69f34d432
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomicrmw-sm70.ll
@@ -0,0 +1,3111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+
+define i8 @xchg_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: xchg_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<14>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b8 %r5, [xchg_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r6, %rd2;
+; SM70-NEXT: and.b32 %r7, %r6, 3;
+; SM70-NEXT: shl.b32 %r1, %r7, 3;
+; SM70-NEXT: mov.b32 %r8, 255;
+; SM70-NEXT: shl.b32 %r9, %r8, %r1;
+; SM70-NEXT: not.b32 %r2, %r9;
+; SM70-NEXT: shl.b32 %r3, %r5, %r1;
+; SM70-NEXT: ld.global.b32 %r13, [%rd1];
+; SM70-NEXT: $L__BB0_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r10, %r13, %r2;
+; SM70-NEXT: or.b32 %r11, %r10, %r3;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r4, [%rd1], %r13, %r11;
+; SM70-NEXT: setp.ne.b32 %p1, %r4, %r13;
+; SM70-NEXT: mov.b32 %r13, %r4;
+; SM70-NEXT: @%p1 bra $L__BB0_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r12, %r4, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r12;
+; SM70-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @xchg_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: xchg_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<14>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b16 %r5, [xchg_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r6, %rd2;
+; SM70-NEXT: and.b32 %r7, %r6, 3;
+; SM70-NEXT: shl.b32 %r1, %r7, 3;
+; SM70-NEXT: mov.b32 %r8, 65535;
+; SM70-NEXT: shl.b32 %r9, %r8, %r1;
+; SM70-NEXT: not.b32 %r2, %r9;
+; SM70-NEXT: shl.b32 %r3, %r5, %r1;
+; SM70-NEXT: ld.global.b32 %r13, [%rd1];
+; SM70-NEXT: $L__BB1_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r10, %r13, %r2;
+; SM70-NEXT: or.b32 %r11, %r10, %r3;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r4, [%rd1], %r13, %r11;
+; SM70-NEXT: setp.ne.b32 %p1, %r4, %r13;
+; SM70-NEXT: mov.b32 %r13, %r4;
+; SM70-NEXT: @%p1 bra $L__BB1_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r12, %r4, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r12;
+; SM70-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @xchg_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: xchg_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [xchg_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [xchg_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.exch.b32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @xchg_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: xchg_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [xchg_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.exch.b64 %rd3, [%rd1], %rd2;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @add_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: add_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [add_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b8 %r6, [add_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 255;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: $L__BB4_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: add.s32 %r10, %r15, %r4;
+; SM70-NEXT: and.b32 %r11, %r10, %r2;
+; SM70-NEXT: and.b32 %r12, %r15, %r3;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM70-NEXT: mov.b32 %r15, %r5;
+; SM70-NEXT: @%p1 bra $L__BB4_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r5, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @add_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: add_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [add_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b16 %r6, [add_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 65535;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: $L__BB5_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: add.s32 %r10, %r15, %r4;
+; SM70-NEXT: and.b32 %r11, %r10, %r2;
+; SM70-NEXT: and.b32 %r12, %r15, %r3;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM70-NEXT: mov.b32 %r15, %r5;
+; SM70-NEXT: @%p1 bra $L__BB5_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r5, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @add_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: add_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [add_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [add_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @add_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: add_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [add_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [add_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.add.u64 %rd3, [%rd1], %rd2;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @sub_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: sub_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b8 %r6, [sub_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 255;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: $L__BB8_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: sub.s32 %r10, %r15, %r4;
+; SM70-NEXT: and.b32 %r11, %r10, %r2;
+; SM70-NEXT: and.b32 %r12, %r15, %r3;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM70-NEXT: mov.b32 %r15, %r5;
+; SM70-NEXT: @%p1 bra $L__BB8_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r5, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @sub_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: sub_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b16 %r6, [sub_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 65535;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: $L__BB9_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: sub.s32 %r10, %r15, %r4;
+; SM70-NEXT: and.b32 %r11, %r10, %r2;
+; SM70-NEXT: and.b32 %r12, %r15, %r3;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM70-NEXT: mov.b32 %r15, %r5;
+; SM70-NEXT: @%p1 bra $L__BB9_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r5, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @sub_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: sub_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<4>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [sub_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [sub_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: neg.s32 %r2, %r1;
+; SM70-NEXT: atom.acq_rel.cta.global.add.u32 %r3, [%rd1], %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r3;
+; SM70-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @sub_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: sub_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [sub_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: neg.s64 %rd3, %rd2;
+; SM70-NEXT: atom.acq_rel.cta.global.add.u64 %rd4, [%rd1], %rd3;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM70-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @and_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: and_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<12>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [and_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b8 %r1, [and_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd2, %rd1, -4;
+; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM70-NEXT: and.b32 %r3, %r2, 3;
+; SM70-NEXT: shl.b32 %r4, %r3, 3;
+; SM70-NEXT: mov.b32 %r5, 255;
+; SM70-NEXT: shl.b32 %r6, %r5, %r4;
+; SM70-NEXT: not.b32 %r7, %r6;
+; SM70-NEXT: shl.b32 %r8, %r1, %r4;
+; SM70-NEXT: or.b32 %r9, %r8, %r7;
+; SM70-NEXT: atom.relaxed.cta.global.and.b32 %r10, [%rd2], %r9;
+; SM70-NEXT: shr.u32 %r11, %r10, %r4;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r11;
+; SM70-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @and_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: and_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<12>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [and_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b16 %r1, [and_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd2, %rd1, -4;
+; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM70-NEXT: and.b32 %r3, %r2, 3;
+; SM70-NEXT: shl.b32 %r4, %r3, 3;
+; SM70-NEXT: mov.b32 %r5, 65535;
+; SM70-NEXT: shl.b32 %r6, %r5, %r4;
+; SM70-NEXT: not.b32 %r7, %r6;
+; SM70-NEXT: shl.b32 %r8, %r1, %r4;
+; SM70-NEXT: or.b32 %r9, %r8, %r7;
+; SM70-NEXT: atom.relaxed.cta.global.and.b32 %r10, [%rd2], %r9;
+; SM70-NEXT: shr.u32 %r11, %r10, %r4;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r11;
+; SM70-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @and_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: and_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [and_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [and_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.and.b32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @and_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: and_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [and_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [and_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.and.b64 %rd3, [%rd1], %rd2;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @nand_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: nand_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<17>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b8 %r6, [nand_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 255;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r16, [%rd1];
+; SM70-NEXT: $L__BB16_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r10, %r16, %r4;
+; SM70-NEXT: not.b32 %r11, %r10;
+; SM70-NEXT: and.b32 %r12, %r11, %r2;
+; SM70-NEXT: and.b32 %r13, %r16, %r3;
+; SM70-NEXT: or.b32 %r14, %r13, %r12;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM70-NEXT: mov.b32 %r16, %r5;
+; SM70-NEXT: @%p1 bra $L__BB16_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r15, %r5, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r15;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @nand_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: nand_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<17>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b16 %r6, [nand_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 65535;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r16, [%rd1];
+; SM70-NEXT: $L__BB17_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r10, %r16, %r4;
+; SM70-NEXT: not.b32 %r11, %r10;
+; SM70-NEXT: and.b32 %r12, %r11, %r2;
+; SM70-NEXT: and.b32 %r13, %r16, %r3;
+; SM70-NEXT: or.b32 %r14, %r13, %r12;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM70-NEXT: mov.b32 %r16, %r5;
+; SM70-NEXT: @%p1 bra $L__BB17_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r15, %r5, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r15;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @nand_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: nand_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<6>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b32 %r2, [nand_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd1, [nand_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.global.b32 %r5, [%rd1];
+; SM70-NEXT: $L__BB18_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r3, %r5, %r2;
+; SM70-NEXT: not.b32 %r4, %r3;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM70-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM70-NEXT: mov.b32 %r5, %r1;
+; SM70-NEXT: @%p1 bra $L__BB18_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @nand_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: nand_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b64 %rd<7>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd3, [nand_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM70-NEXT: $L__BB19_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b64 %rd4, %rd6, %rd3;
+; SM70-NEXT: not.b64 %rd5, %rd4;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM70-NEXT: setp.ne.b64 %p1, %rd1, %rd6;
+; SM70-NEXT: mov.b64 %rd6, %rd1;
+; SM70-NEXT: @%p1 bra $L__BB19_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @or_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: or_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<8>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [or_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b8 %r1, [or_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd2, %rd1, -4;
+; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM70-NEXT: and.b32 %r3, %r2, 3;
+; SM70-NEXT: shl.b32 %r4, %r3, 3;
+; SM70-NEXT: shl.b32 %r5, %r1, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.or.b32 %r6, [%rd2], %r5;
+; SM70-NEXT: shr.u32 %r7, %r6, %r4;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r7;
+; SM70-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @or_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: or_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<8>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [or_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b16 %r1, [or_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd2, %rd1, -4;
+; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM70-NEXT: and.b32 %r3, %r2, 3;
+; SM70-NEXT: shl.b32 %r4, %r3, 3;
+; SM70-NEXT: shl.b32 %r5, %r1, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.or.b32 %r6, [%rd2], %r5;
+; SM70-NEXT: shr.u32 %r7, %r6, %r4;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r7;
+; SM70-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @or_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: or_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [or_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [or_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.or.b32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @or_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: or_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [or_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [or_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.or.b64 %rd3, [%rd1], %rd2;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @xor_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: xor_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<8>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b8 %r1, [xor_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd2, %rd1, -4;
+; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM70-NEXT: and.b32 %r3, %r2, 3;
+; SM70-NEXT: shl.b32 %r4, %r3, 3;
+; SM70-NEXT: shl.b32 %r5, %r1, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.xor.b32 %r6, [%rd2], %r5;
+; SM70-NEXT: shr.u32 %r7, %r6, %r4;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r7;
+; SM70-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @xor_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: xor_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<8>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b16 %r1, [xor_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd2, %rd1, -4;
+; SM70-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM70-NEXT: and.b32 %r3, %r2, 3;
+; SM70-NEXT: shl.b32 %r4, %r3, 3;
+; SM70-NEXT: shl.b32 %r5, %r1, %r4;
+; SM70-NEXT: atom.relaxed.cta.global.xor.b32 %r6, [%rd2], %r5;
+; SM70-NEXT: shr.u32 %r7, %r6, %r4;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r7;
+; SM70-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @xor_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: xor_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [xor_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.xor.b32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @xor_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: xor_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [xor_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.xor.b64 %rd3, [%rd1], %rd2;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @max_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: max_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b16 %rs<5>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b8 %rs1, [max_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [max_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 255;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: cvt.s16.s8 %rs3, %rs1;
+; SM70-NEXT: $L__BB28_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r15, %r1;
+; SM70-NEXT: cvt.s8.s32 %rs2, %r8;
+; SM70-NEXT: max.s16 %rs4, %rs2, %rs3;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM70-NEXT: and.b32 %r10, %r9, 255;
+; SM70-NEXT: shl.b32 %r11, %r10, %r1;
+; SM70-NEXT: and.b32 %r12, %r15, %r2;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p1, %r3, %r15;
+; SM70-NEXT: mov.b32 %r15, %r3;
+; SM70-NEXT: @%p1 bra $L__BB28_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @max_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: max_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b16 %rs<4>;
+; SM70-NEXT: .reg .b32 %r<15>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b16 %rs1, [max_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [max_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 65535;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r14, [%rd1];
+; SM70-NEXT: $L__BB29_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r14, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: max.s16 %rs3, %rs2, %rs1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM70-NEXT: shl.b32 %r10, %r9, %r1;
+; SM70-NEXT: and.b32 %r11, %r14, %r2;
+; SM70-NEXT: or.b32 %r12, %r11, %r10;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM70-NEXT: mov.b32 %r14, %r3;
+; SM70-NEXT: @%p1 bra $L__BB29_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r13, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @max_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: max_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [max_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [max_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.max.s32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @max_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: max_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [max_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [max_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.max.s64 %rd3, [%rd1], %rd2;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @min_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: min_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b16 %rs<5>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b8 %rs1, [min_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [min_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 255;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: cvt.s16.s8 %rs3, %rs1;
+; SM70-NEXT: $L__BB32_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r15, %r1;
+; SM70-NEXT: cvt.s8.s32 %rs2, %r8;
+; SM70-NEXT: min.s16 %rs4, %rs2, %rs3;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM70-NEXT: and.b32 %r10, %r9, 255;
+; SM70-NEXT: shl.b32 %r11, %r10, %r1;
+; SM70-NEXT: and.b32 %r12, %r15, %r2;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p1, %r3, %r15;
+; SM70-NEXT: mov.b32 %r15, %r3;
+; SM70-NEXT: @%p1 bra $L__BB32_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @min_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: min_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b16 %rs<4>;
+; SM70-NEXT: .reg .b32 %r<15>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b16 %rs1, [min_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [min_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 65535;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r14, [%rd1];
+; SM70-NEXT: $L__BB33_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r14, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: min.s16 %rs3, %rs2, %rs1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM70-NEXT: shl.b32 %r10, %r9, %r1;
+; SM70-NEXT: and.b32 %r11, %r14, %r2;
+; SM70-NEXT: or.b32 %r12, %r11, %r10;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM70-NEXT: mov.b32 %r14, %r3;
+; SM70-NEXT: @%p1 bra $L__BB33_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r13, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @min_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: min_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [min_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [min_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.min.s32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @min_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: min_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [min_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [min_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.min.s64 %rd3, [%rd1], %rd2;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @umax_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: umax_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b16 %rs<5>;
+; SM70-NEXT: .reg .b32 %r<15>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b8 %rs1, [umax_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 255;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r14, [%rd1];
+; SM70-NEXT: $L__BB36_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r14, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: and.b16 %rs3, %rs2, 255;
+; SM70-NEXT: max.u16 %rs4, %rs3, %rs1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM70-NEXT: shl.b32 %r10, %r9, %r1;
+; SM70-NEXT: and.b32 %r11, %r14, %r2;
+; SM70-NEXT: or.b32 %r12, %r11, %r10;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM70-NEXT: mov.b32 %r14, %r3;
+; SM70-NEXT: @%p1 bra $L__BB36_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r13, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @umax_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: umax_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b16 %rs<4>;
+; SM70-NEXT: .reg .b32 %r<15>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b16 %rs1, [umax_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 65535;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r14, [%rd1];
+; SM70-NEXT: $L__BB37_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r14, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: max.u16 %rs3, %rs2, %rs1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM70-NEXT: shl.b32 %r10, %r9, %r1;
+; SM70-NEXT: and.b32 %r11, %r14, %r2;
+; SM70-NEXT: or.b32 %r12, %r11, %r10;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM70-NEXT: mov.b32 %r14, %r3;
+; SM70-NEXT: @%p1 bra $L__BB37_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r13, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @umax_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: umax_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [umax_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [umax_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.max.u32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @umax_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: umax_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [umax_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.max.u64 %rd3, [%rd1], %rd2;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @umin_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: umin_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b16 %rs<5>;
+; SM70-NEXT: .reg .b32 %r<15>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b8 %rs1, [umin_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 255;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r14, [%rd1];
+; SM70-NEXT: $L__BB40_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r14, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: and.b16 %rs3, %rs2, 255;
+; SM70-NEXT: min.u16 %rs4, %rs3, %rs1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM70-NEXT: shl.b32 %r10, %r9, %r1;
+; SM70-NEXT: and.b32 %r11, %r14, %r2;
+; SM70-NEXT: or.b32 %r12, %r11, %r10;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM70-NEXT: mov.b32 %r14, %r3;
+; SM70-NEXT: @%p1 bra $L__BB40_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r13, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @umin_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: umin_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b16 %rs<4>;
+; SM70-NEXT: .reg .b32 %r<15>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b16 %rs1, [umin_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 65535;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r14, [%rd1];
+; SM70-NEXT: $L__BB41_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r14, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: min.u16 %rs3, %rs2, %rs1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM70-NEXT: shl.b32 %r10, %r9, %r1;
+; SM70-NEXT: and.b32 %r11, %r14, %r2;
+; SM70-NEXT: or.b32 %r12, %r11, %r10;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM70-NEXT: mov.b32 %r14, %r3;
+; SM70-NEXT: @%p1 bra $L__BB41_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r13, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @umin_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: umin_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [umin_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [umin_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.min.u32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @umin_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: umin_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [umin_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.min.u64 %rd3, [%rd1], %rd2;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @uinc_wrap_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: uinc_wrap_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<6>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b8 %rs1, [uinc_wrap_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 255;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: $L__BB44_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r15, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: and.b16 %rs3, %rs2, 255;
+; SM70-NEXT: add.s16 %rs4, %rs2, 1;
+; SM70-NEXT: setp.ge.u16 %p1, %rs3, %rs1;
+; SM70-NEXT: selp.b16 %rs5, 0, %rs4, %p1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM70-NEXT: and.b32 %r10, %r9, 255;
+; SM70-NEXT: shl.b32 %r11, %r10, %r1;
+; SM70-NEXT: and.b32 %r12, %r15, %r2;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p2, %r3, %r15;
+; SM70-NEXT: mov.b32 %r15, %r3;
+; SM70-NEXT: @%p2 bra $L__BB44_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @uinc_wrap_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: uinc_wrap_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<5>;
+; SM70-NEXT: .reg .b32 %r<15>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b16 %rs1, [uinc_wrap_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 65535;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r14, [%rd1];
+; SM70-NEXT: $L__BB45_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r14, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: add.s16 %rs3, %rs2, 1;
+; SM70-NEXT: setp.ge.u16 %p1, %rs2, %rs1;
+; SM70-NEXT: selp.b16 %rs4, 0, %rs3, %p1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM70-NEXT: shl.b32 %r10, %r9, %r1;
+; SM70-NEXT: and.b32 %r11, %r14, %r2;
+; SM70-NEXT: or.b32 %r12, %r11, %r10;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT: setp.ne.b32 %p2, %r3, %r14;
+; SM70-NEXT: mov.b32 %r14, %r3;
+; SM70-NEXT: @%p2 bra $L__BB45_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r13, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @uinc_wrap_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: uinc_wrap_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [uinc_wrap_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [uinc_wrap_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.inc.u32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @uinc_wrap_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: uinc_wrap_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b64 %rd<7>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd3, [uinc_wrap_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM70-NEXT: $L__BB47_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: add.s64 %rd4, %rd6, 1;
+; SM70-NEXT: setp.ge.u64 %p1, %rd6, %rd3;
+; SM70-NEXT: selp.b64 %rd5, 0, %rd4, %p1;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM70-NEXT: setp.ne.b64 %p2, %rd1, %rd6;
+; SM70-NEXT: mov.b64 %rd6, %rd1;
+; SM70-NEXT: @%p2 bra $L__BB47_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @udec_wrap_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: udec_wrap_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<4>;
+; SM70-NEXT: .reg .b16 %rs<7>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b8 %rs1, [udec_wrap_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 255;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: $L__BB48_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r15, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: and.b16 %rs3, %rs2, 255;
+; SM70-NEXT: add.s16 %rs4, %rs2, -1;
+; SM70-NEXT: setp.eq.b16 %p1, %rs3, 0;
+; SM70-NEXT: setp.gt.u16 %p2, %rs3, %rs1;
+; SM70-NEXT: selp.b16 %rs5, %rs1, %rs4, %p2;
+; SM70-NEXT: selp.b16 %rs6, %rs1, %rs5, %p1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs6;
+; SM70-NEXT: and.b32 %r10, %r9, 255;
+; SM70-NEXT: shl.b32 %r11, %r10, %r1;
+; SM70-NEXT: and.b32 %r12, %r15, %r2;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p3, %r3, %r15;
+; SM70-NEXT: mov.b32 %r15, %r3;
+; SM70-NEXT: @%p3 bra $L__BB48_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @udec_wrap_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: udec_wrap_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<4>;
+; SM70-NEXT: .reg .b16 %rs<6>;
+; SM70-NEXT: .reg .b32 %r<15>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b16 %rs1, [udec_wrap_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 65535;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r14, [%rd1];
+; SM70-NEXT: $L__BB49_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r14, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: add.s16 %rs3, %rs2, -1;
+; SM70-NEXT: setp.eq.b16 %p1, %rs2, 0;
+; SM70-NEXT: setp.gt.u16 %p2, %rs2, %rs1;
+; SM70-NEXT: selp.b16 %rs4, %rs1, %rs3, %p2;
+; SM70-NEXT: selp.b16 %rs5, %rs1, %rs4, %p1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM70-NEXT: shl.b32 %r10, %r9, %r1;
+; SM70-NEXT: and.b32 %r11, %r14, %r2;
+; SM70-NEXT: or.b32 %r12, %r11, %r10;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT: setp.ne.b32 %p3, %r3, %r14;
+; SM70-NEXT: mov.b32 %r14, %r3;
+; SM70-NEXT: @%p3 bra $L__BB49_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r13, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @udec_wrap_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: udec_wrap_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [udec_wrap_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [udec_wrap_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.dec.u32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @udec_wrap_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: udec_wrap_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<4>;
+; SM70-NEXT: .reg .b64 %rd<8>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd3, [udec_wrap_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.global.b64 %rd7, [%rd2];
+; SM70-NEXT: $L__BB51_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: add.s64 %rd4, %rd7, -1;
+; SM70-NEXT: setp.eq.b64 %p1, %rd7, 0;
+; SM70-NEXT: setp.gt.u64 %p2, %rd7, %rd3;
+; SM70-NEXT: selp.b64 %rd5, %rd3, %rd4, %p2;
+; SM70-NEXT: selp.b64 %rd6, %rd3, %rd5, %p1;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd7, %rd6;
+; SM70-NEXT: setp.ne.b64 %p3, %rd1, %rd7;
+; SM70-NEXT: mov.b64 %rd7, %rd1;
+; SM70-NEXT: @%p3 bra $L__BB51_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @usub_cond_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: usub_cond_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<6>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b8 %rs1, [usub_cond_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 255;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: $L__BB52_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r15, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: and.b16 %rs3, %rs2, 255;
+; SM70-NEXT: setp.ge.u16 %p1, %rs3, %rs1;
+; SM70-NEXT: sub.s16 %rs4, %rs2, %rs1;
+; SM70-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM70-NEXT: and.b32 %r10, %r9, 255;
+; SM70-NEXT: shl.b32 %r11, %r10, %r1;
+; SM70-NEXT: and.b32 %r12, %r15, %r2;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p2, %r3, %r15;
+; SM70-NEXT: mov.b32 %r15, %r3;
+; SM70-NEXT: @%p2 bra $L__BB52_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @usub_cond_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: usub_cond_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<5>;
+; SM70-NEXT: .reg .b32 %r<15>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b16 %rs1, [usub_cond_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 65535;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r14, [%rd1];
+; SM70-NEXT: $L__BB53_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r14, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: setp.ge.u16 %p1, %rs2, %rs1;
+; SM70-NEXT: sub.s16 %rs3, %rs2, %rs1;
+; SM70-NEXT: selp.b16 %rs4, %rs3, %rs2, %p1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM70-NEXT: shl.b32 %r10, %r9, %r1;
+; SM70-NEXT: and.b32 %r11, %r14, %r2;
+; SM70-NEXT: or.b32 %r12, %r11, %r10;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT: setp.ne.b32 %p2, %r3, %r14;
+; SM70-NEXT: mov.b32 %r14, %r3;
+; SM70-NEXT: @%p2 bra $L__BB53_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r13, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @usub_cond_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: usub_cond_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b32 %r<6>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b32 %r2, [usub_cond_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd1, [usub_cond_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.global.b32 %r5, [%rd1];
+; SM70-NEXT: $L__BB54_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: setp.ge.u32 %p1, %r5, %r2;
+; SM70-NEXT: sub.s32 %r3, %r5, %r2;
+; SM70-NEXT: selp.b32 %r4, %r3, %r5, %p1;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM70-NEXT: setp.ne.b32 %p2, %r1, %r5;
+; SM70-NEXT: mov.b32 %r5, %r1;
+; SM70-NEXT: @%p2 bra $L__BB54_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @usub_cond_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: usub_cond_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b64 %rd<7>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd3, [usub_cond_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM70-NEXT: $L__BB55_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: setp.ge.u64 %p1, %rd6, %rd3;
+; SM70-NEXT: sub.s64 %rd4, %rd6, %rd3;
+; SM70-NEXT: selp.b64 %rd5, %rd4, %rd6, %p1;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM70-NEXT: setp.ne.b64 %p2, %rd1, %rd6;
+; SM70-NEXT: mov.b64 %rd6, %rd1;
+; SM70-NEXT: @%p2 bra $L__BB55_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @usub_sat_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: usub_sat_acq_rel_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b16 %rs<6>;
+; SM70-NEXT: .reg .b32 %r<15>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b8 %rs1, [usub_sat_acq_rel_i8_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 255;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r14, [%rd1];
+; SM70-NEXT: $L__BB56_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r14, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: and.b16 %rs3, %rs2, 255;
+; SM70-NEXT: max.u16 %rs4, %rs3, %rs1;
+; SM70-NEXT: sub.s16 %rs5, %rs4, %rs1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM70-NEXT: shl.b32 %r10, %r9, %r1;
+; SM70-NEXT: and.b32 %r11, %r14, %r2;
+; SM70-NEXT: or.b32 %r12, %r11, %r10;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM70-NEXT: mov.b32 %r14, %r3;
+; SM70-NEXT: @%p1 bra $L__BB56_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r13, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @usub_sat_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM70-LABEL: usub_sat_acq_rel_i16_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b16 %rs<5>;
+; SM70-NEXT: .reg .b32 %r<15>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b16 %rs1, [usub_sat_acq_rel_i16_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i16_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT: and.b32 %r5, %r4, 3;
+; SM70-NEXT: shl.b32 %r1, %r5, 3;
+; SM70-NEXT: mov.b32 %r6, 65535;
+; SM70-NEXT: shl.b32 %r7, %r6, %r1;
+; SM70-NEXT: not.b32 %r2, %r7;
+; SM70-NEXT: ld.global.b32 %r14, [%rd1];
+; SM70-NEXT: $L__BB57_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: shr.u32 %r8, %r14, %r1;
+; SM70-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT: max.u16 %rs3, %rs2, %rs1;
+; SM70-NEXT: sub.s16 %rs4, %rs3, %rs1;
+; SM70-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM70-NEXT: shl.b32 %r10, %r9, %r1;
+; SM70-NEXT: and.b32 %r11, %r14, %r2;
+; SM70-NEXT: or.b32 %r12, %r11, %r10;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM70-NEXT: mov.b32 %r14, %r3;
+; SM70-NEXT: @%p1 bra $L__BB57_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r13, %r3, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
+; SM70-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @usub_sat_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: usub_sat_acq_rel_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<6>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b32 %r2, [usub_sat_acq_rel_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd1, [usub_sat_acq_rel_i32_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.global.b32 %r5, [%rd1];
+; SM70-NEXT: $L__BB58_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: max.u32 %r3, %r5, %r2;
+; SM70-NEXT: sub.s32 %r4, %r3, %r2;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM70-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM70-NEXT: mov.b32 %r5, %r1;
+; SM70-NEXT: @%p1 bra $L__BB58_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @usub_sat_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM70-LABEL: usub_sat_acq_rel_i64_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b64 %rd<7>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd3, [usub_sat_acq_rel_i64_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i64_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM70-NEXT: $L__BB59_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: max.u64 %rd4, %rd6, %rd3;
+; SM70-NEXT: sub.s64 %rd5, %rd4, %rd3;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM70-NEXT: setp.ne.b64 %p1, %rd1, %rd6;
+; SM70-NEXT: mov.b64 %rd6, %rd1;
+; SM70-NEXT: @%p1 bra $L__BB59_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define float @fadd_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM70-LABEL: fadd_acq_rel_float_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [fadd_acq_rel_float_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [fadd_acq_rel_float_global_cta_param_1];
+; SM70-NEXT: atom.acq_rel.cta.global.add.f32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw fadd ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fsub_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM70-LABEL: fsub_acq_rel_float_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<5>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b32 %r2, [fsub_acq_rel_float_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd1, [fsub_acq_rel_float_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.global.b32 %r4, [%rd1];
+; SM70-NEXT: $L__BB61_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: sub.rn.f32 %r3, %r4, %r2;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
+; SM70-NEXT: setp.ne.b32 %p1, %r1, %r4;
+; SM70-NEXT: mov.b32 %r4, %r1;
+; SM70-NEXT: @%p1 bra $L__BB61_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw fsub ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fmin_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM70-LABEL: fmin_acq_rel_float_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<5>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b32 %r2, [fmin_acq_rel_float_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd1, [fmin_acq_rel_float_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.global.b32 %r4, [%rd1];
+; SM70-NEXT: $L__BB62_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: min.f32 %r3, %r4, %r2;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
+; SM70-NEXT: setp.ne.b32 %p1, %r1, %r4;
+; SM70-NEXT: mov.b32 %r4, %r1;
+; SM70-NEXT: @%p1 bra $L__BB62_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw fmin ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+; f32 fmax at acq_rel/"block" (CTA) scope: no native f32 max atomic here, so it is
+; expanded to a CAS retry loop (atom.relaxed.cta.global.cas.b32) bracketed by
+; fence.acq_rel.cta before the initial load and after loop exit.
+define float @fmax_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM70-LABEL: fmax_acq_rel_float_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<2>;
+; SM70-NEXT:    .reg .b32 %r<5>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b32 %r2, [fmax_acq_rel_float_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd1, [fmax_acq_rel_float_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.global.b32 %r4, [%rd1];
+; SM70-NEXT:  $L__BB63_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    max.f32 %r3, %r4, %r2;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
+; SM70-NEXT:    setp.ne.b32 %p1, %r1, %r4;
+; SM70-NEXT:    mov.b32 %r4, %r1;
+; SM70-NEXT:    @%p1 bra $L__BB63_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r1;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fmax ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+  ret float %retval
+}
+
+; f32 fminimum (IEEE-754 2019 minimum: NaN-propagating, -0.0 < +0.0): the CAS
+; loop additionally selects NaN on unordered inputs and special-cases the
+; signed-zero bit pattern (-2147483648 == bits of -0.0f) before the exchange.
+define float @fminimum_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM70-LABEL: fminimum_acq_rel_float_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<6>;
+; SM70-NEXT:    .reg .b32 %r<9>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b32 %r2, [fminimum_acq_rel_float_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd1, [fminimum_acq_rel_float_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.global.b32 %r8, [%rd1];
+; SM70-NEXT:    setp.eq.b32 %p3, %r2, -2147483648;
+; SM70-NEXT:  $L__BB64_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    setp.nan.f32 %p1, %r8, %r2;
+; SM70-NEXT:    min.f32 %r3, %r8, %r2;
+; SM70-NEXT:    selp.f32 %r4, 0f7FC00000, %r3, %p1;
+; SM70-NEXT:    setp.eq.b32 %p2, %r8, -2147483648;
+; SM70-NEXT:    selp.f32 %r5, %r8, %r4, %p2;
+; SM70-NEXT:    selp.f32 %r6, %r2, %r5, %p3;
+; SM70-NEXT:    setp.eq.f32 %p4, %r4, 0f00000000;
+; SM70-NEXT:    selp.f32 %r7, %r6, %r4, %p4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r8, %r7;
+; SM70-NEXT:    setp.ne.b32 %p5, %r1, %r8;
+; SM70-NEXT:    mov.b32 %r8, %r1;
+; SM70-NEXT:    @%p5 bra $L__BB64_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r1;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fminimum ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+  ret float %retval
+}
+
+; f32 fmaximum: mirror of fminimum above; the signed-zero special case compares
+; against bit pattern 0 (+0.0f) instead of the -0.0f pattern.
+define float @fmaximum_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM70-LABEL: fmaximum_acq_rel_float_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<6>;
+; SM70-NEXT:    .reg .b32 %r<9>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b32 %r2, [fmaximum_acq_rel_float_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd1, [fmaximum_acq_rel_float_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.global.b32 %r8, [%rd1];
+; SM70-NEXT:    setp.eq.b32 %p3, %r2, 0;
+; SM70-NEXT:  $L__BB65_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    setp.nan.f32 %p1, %r8, %r2;
+; SM70-NEXT:    max.f32 %r3, %r8, %r2;
+; SM70-NEXT:    selp.f32 %r4, 0f7FC00000, %r3, %p1;
+; SM70-NEXT:    setp.eq.b32 %p2, %r8, 0;
+; SM70-NEXT:    selp.f32 %r5, %r8, %r4, %p2;
+; SM70-NEXT:    selp.f32 %r6, %r2, %r5, %p3;
+; SM70-NEXT:    setp.eq.f32 %p4, %r4, 0f00000000;
+; SM70-NEXT:    selp.f32 %r7, %r6, %r4, %p4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r8, %r7;
+; SM70-NEXT:    setp.ne.b32 %p5, %r1, %r8;
+; SM70-NEXT:    mov.b32 %r8, %r1;
+; SM70-NEXT:    @%p5 bra $L__BB65_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r1;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fmaximum ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+  ret float %retval
+}
+
+; f64 fadd: the only double RMW here with a native instruction — lowers directly
+; to atom.acq_rel.cta.global.add.f64, so no fences or CAS loop are emitted.
+define double @fadd_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM70-LABEL: fadd_acq_rel_double_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b64 %rd<4>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [fadd_acq_rel_double_global_cta_param_0];
+; SM70-NEXT:    ld.param.b64 %rd2, [fadd_acq_rel_double_global_cta_param_1];
+; SM70-NEXT:    atom.acq_rel.cta.global.add.f64 %rd3, [%rd1], %rd2;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fadd ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+  ret double %retval
+}
+
+; f64 fsub: no native atomic — expanded to a 64-bit CAS retry loop with
+; fence.acq_rel.cta on entry and exit; the CAS itself is relaxed at CTA scope.
+define double @fsub_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM70-LABEL: fsub_acq_rel_double_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<2>;
+; SM70-NEXT:    .reg .b64 %rd<6>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd3, [fsub_acq_rel_double_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fsub_acq_rel_double_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.global.b64 %rd5, [%rd2];
+; SM70-NEXT:  $L__BB67_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    sub.rn.f64 %rd4, %rd5, %rd3;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
+; SM70-NEXT:    setp.ne.b64 %p1, %rd1, %rd5;
+; SM70-NEXT:    mov.b64 %rd5, %rd1;
+; SM70-NEXT:    @%p1 bra $L__BB67_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd1;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fsub ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+  ret double %retval
+}
+
+; f64 fmin: CAS loop computing min.f64 each iteration (LLVM fmin = IEEE minNum,
+; carried out by PTX min.f64 directly — no extra NaN/zero fixup needed).
+define double @fmin_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM70-LABEL: fmin_acq_rel_double_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<2>;
+; SM70-NEXT:    .reg .b64 %rd<6>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd3, [fmin_acq_rel_double_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fmin_acq_rel_double_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.global.b64 %rd5, [%rd2];
+; SM70-NEXT:  $L__BB68_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    min.f64 %rd4, %rd5, %rd3;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
+; SM70-NEXT:    setp.ne.b64 %p1, %rd1, %rd5;
+; SM70-NEXT:    mov.b64 %rd5, %rd1;
+; SM70-NEXT:    @%p1 bra $L__BB68_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd1;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fmin ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+  ret double %retval
+}
+
+; f64 fmax: same CAS-loop shape as fmin above, using max.f64.
+define double @fmax_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM70-LABEL: fmax_acq_rel_double_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<2>;
+; SM70-NEXT:    .reg .b64 %rd<6>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd3, [fmax_acq_rel_double_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fmax_acq_rel_double_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.global.b64 %rd5, [%rd2];
+; SM70-NEXT:  $L__BB69_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    max.f64 %rd4, %rd5, %rd3;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
+; SM70-NEXT:    setp.ne.b64 %p1, %rd1, %rd5;
+; SM70-NEXT:    mov.b64 %rd5, %rd1;
+; SM70-NEXT:    @%p1 bra $L__BB69_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd1;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fmax ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+  ret double %retval
+}
+
+; f64 fminimum: NaN-propagating, signed-zero-aware min; the -0.0 bit pattern is
+; -9223372036854775808 (0x8000000000000000) in the b64 compares below.
+define double @fminimum_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM70-LABEL: fminimum_acq_rel_double_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<6>;
+; SM70-NEXT:    .reg .b64 %rd<10>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd3, [fminimum_acq_rel_double_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fminimum_acq_rel_double_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.global.b64 %rd9, [%rd2];
+; SM70-NEXT:    setp.eq.b64 %p3, %rd3, -9223372036854775808;
+; SM70-NEXT:  $L__BB70_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    setp.nan.f64 %p1, %rd9, %rd3;
+; SM70-NEXT:    min.f64 %rd4, %rd9, %rd3;
+; SM70-NEXT:    selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
+; SM70-NEXT:    setp.eq.b64 %p2, %rd9, -9223372036854775808;
+; SM70-NEXT:    selp.f64 %rd6, %rd9, %rd5, %p2;
+; SM70-NEXT:    selp.f64 %rd7, %rd3, %rd6, %p3;
+; SM70-NEXT:    setp.eq.f64 %p4, %rd5, 0d0000000000000000;
+; SM70-NEXT:    selp.f64 %rd8, %rd7, %rd5, %p4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd9, %rd8;
+; SM70-NEXT:    setp.ne.b64 %p5, %rd1, %rd9;
+; SM70-NEXT:    mov.b64 %rd9, %rd1;
+; SM70-NEXT:    @%p5 bra $L__BB70_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd1;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fminimum ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+  ret double %retval
+}
+
+; f64 fmaximum: mirror of fminimum; the signed-zero compare uses bit pattern 0
+; (+0.0) instead of the -0.0 pattern.
+define double @fmaximum_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM70-LABEL: fmaximum_acq_rel_double_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<6>;
+; SM70-NEXT:    .reg .b64 %rd<10>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd3, [fmaximum_acq_rel_double_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fmaximum_acq_rel_double_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    ld.global.b64 %rd9, [%rd2];
+; SM70-NEXT:    setp.eq.b64 %p3, %rd3, 0;
+; SM70-NEXT:  $L__BB71_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    setp.nan.f64 %p1, %rd9, %rd3;
+; SM70-NEXT:    max.f64 %rd4, %rd9, %rd3;
+; SM70-NEXT:    selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
+; SM70-NEXT:    setp.eq.b64 %p2, %rd9, 0;
+; SM70-NEXT:    selp.f64 %rd6, %rd9, %rd5, %p2;
+; SM70-NEXT:    selp.f64 %rd7, %rd3, %rd6, %p3;
+; SM70-NEXT:    setp.eq.f64 %p4, %rd5, 0d0000000000000000;
+; SM70-NEXT:    selp.f64 %rd8, %rd7, %rd5, %p4;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd9, %rd8;
+; SM70-NEXT:    setp.ne.b64 %p5, %rd1, %rd9;
+; SM70-NEXT:    mov.b64 %rd9, %rd1;
+; SM70-NEXT:    @%p5 bra $L__BB71_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b64 [func_retval0], %rd1;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fmaximum ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+  ret double %retval
+}
+
+; f16 fadd: lowers to the native atom.acq_rel.cta.global.add.noftz.f16 — the
+; only half-precision op here that avoids the word-masked CAS expansion.
+define half @fadd_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM70-LABEL: fadd_acq_rel_half_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .b16 %rs<3>;
+; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd1, [fadd_acq_rel_half_global_cta_param_0];
+; SM70-NEXT:    ld.param.b16 %rs1, [fadd_acq_rel_half_global_cta_param_1];
+; SM70-NEXT:    atom.acq_rel.cta.global.add.noftz.f16 %rs2, [%rd1], %rs1;
+; SM70-NEXT:    st.param.b16 [func_retval0], %rs2;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fadd ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+  ret half %retval
+}
+
+; f16 fsub: sub-word expansion — the pointer is aligned down to 4 bytes
+; (and.b64 ..., -4), the 16-bit lane is isolated with a shifted 0xFFFF mask,
+; the new value is spliced into the containing word, and a b32 CAS retries
+; until it sticks; acq_rel ordering is provided by the surrounding CTA fences.
+define half @fsub_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM70-LABEL: fsub_acq_rel_half_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<2>;
+; SM70-NEXT:    .reg .b16 %rs<4>;
+; SM70-NEXT:    .reg .b32 %r<15>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [fsub_acq_rel_half_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fsub_acq_rel_half_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT:    and.b32 %r5, %r4, 3;
+; SM70-NEXT:    shl.b32 %r1, %r5, 3;
+; SM70-NEXT:    mov.b32 %r6, 65535;
+; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
+; SM70-NEXT:    not.b32 %r2, %r7;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:  $L__BB73_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    shr.u32 %r8, %r14, %r1;
+; SM70-NEXT:    cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT:    sub.rn.f16 %rs3, %rs2, %rs1;
+; SM70-NEXT:    cvt.u32.u16 %r9, %rs3;
+; SM70-NEXT:    shl.b32 %r10, %r9, %r1;
+; SM70-NEXT:    and.b32 %r11, %r14, %r2;
+; SM70-NEXT:    or.b32 %r12, %r11, %r10;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT:    setp.ne.b32 %p1, %r3, %r14;
+; SM70-NEXT:    mov.b32 %r14, %r3;
+; SM70-NEXT:    @%p1 bra $L__BB73_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r13, %r3, %r1;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b16 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fsub ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+  ret half %retval
+}
+
+; f16 fmin: same sub-word CAS expansion as fsub; the min itself is computed by
+; widening both operands to f32 (cvt.f32.f16), taking min.f32, and rounding
+; back with cvt.rn.f16.f32.
+define half @fmin_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM70-LABEL: fmin_acq_rel_half_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<2>;
+; SM70-NEXT:    .reg .b16 %rs<4>;
+; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [fmin_acq_rel_half_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fmin_acq_rel_half_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT:    and.b32 %r5, %r4, 3;
+; SM70-NEXT:    shl.b32 %r1, %r5, 3;
+; SM70-NEXT:    mov.b32 %r6, 65535;
+; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
+; SM70-NEXT:    not.b32 %r2, %r7;
+; SM70-NEXT:    ld.global.b32 %r17, [%rd1];
+; SM70-NEXT:    cvt.f32.f16 %r10, %rs1;
+; SM70-NEXT:  $L__BB74_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    shr.u32 %r8, %r17, %r1;
+; SM70-NEXT:    cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT:    cvt.f32.f16 %r9, %rs2;
+; SM70-NEXT:    min.f32 %r11, %r9, %r10;
+; SM70-NEXT:    cvt.rn.f16.f32 %rs3, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs3;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    and.b32 %r14, %r17, %r2;
+; SM70-NEXT:    or.b32 %r15, %r14, %r13;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r17, %r15;
+; SM70-NEXT:    setp.ne.b32 %p1, %r3, %r17;
+; SM70-NEXT:    mov.b32 %r17, %r3;
+; SM70-NEXT:    @%p1 bra $L__BB74_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r16, %r3, %r1;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b16 [func_retval0], %r16;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fmin ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+  ret half %retval
+}
+
+; f16 fmax: mirror of fmin above with max.f32 in the widened computation.
+define half @fmax_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM70-LABEL: fmax_acq_rel_half_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<2>;
+; SM70-NEXT:    .reg .b16 %rs<4>;
+; SM70-NEXT:    .reg .b32 %r<18>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [fmax_acq_rel_half_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fmax_acq_rel_half_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT:    and.b32 %r5, %r4, 3;
+; SM70-NEXT:    shl.b32 %r1, %r5, 3;
+; SM70-NEXT:    mov.b32 %r6, 65535;
+; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
+; SM70-NEXT:    not.b32 %r2, %r7;
+; SM70-NEXT:    ld.global.b32 %r17, [%rd1];
+; SM70-NEXT:    cvt.f32.f16 %r10, %rs1;
+; SM70-NEXT:  $L__BB75_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    shr.u32 %r8, %r17, %r1;
+; SM70-NEXT:    cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT:    cvt.f32.f16 %r9, %rs2;
+; SM70-NEXT:    max.f32 %r11, %r9, %r10;
+; SM70-NEXT:    cvt.rn.f16.f32 %rs3, %r11;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs3;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    and.b32 %r14, %r17, %r2;
+; SM70-NEXT:    or.b32 %r15, %r14, %r13;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r17, %r15;
+; SM70-NEXT:    setp.ne.b32 %p1, %r3, %r17;
+; SM70-NEXT:    mov.b32 %r17, %r3;
+; SM70-NEXT:    @%p1 bra $L__BB75_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r16, %r3, %r1;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b16 [func_retval0], %r16;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fmax ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+  ret half %retval
+}
+
+; f16 fminimum: sub-word CAS expansion plus the NaN-propagating / signed-zero
+; (-32768 == bits of -0.0h, 0x7E00 == f16 qNaN) select sequence, done with
+; native f16 compares (setp.lt/nan/eq.f16).
+define half @fminimum_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM70-LABEL: fminimum_acq_rel_half_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<7>;
+; SM70-NEXT:    .reg .b16 %rs<9>;
+; SM70-NEXT:    .reg .b32 %r<15>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [fminimum_acq_rel_half_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fminimum_acq_rel_half_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT:    and.b32 %r5, %r4, 3;
+; SM70-NEXT:    shl.b32 %r1, %r5, 3;
+; SM70-NEXT:    mov.b32 %r6, 65535;
+; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
+; SM70-NEXT:    not.b32 %r2, %r7;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    setp.eq.b16 %p4, %rs1, -32768;
+; SM70-NEXT:  $L__BB76_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    shr.u32 %r8, %r14, %r1;
+; SM70-NEXT:    cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT:    setp.lt.f16 %p1, %rs2, %rs1;
+; SM70-NEXT:    selp.b16 %rs3, %rs2, %rs1, %p1;
+; SM70-NEXT:    setp.nan.f16 %p2, %rs2, %rs1;
+; SM70-NEXT:    selp.b16 %rs4, 0x7E00, %rs3, %p2;
+; SM70-NEXT:    setp.eq.b16 %p3, %rs2, -32768;
+; SM70-NEXT:    selp.b16 %rs5, %rs2, %rs4, %p3;
+; SM70-NEXT:    selp.b16 %rs6, %rs1, %rs5, %p4;
+; SM70-NEXT:    mov.b16 %rs7, 0x0000;
+; SM70-NEXT:    setp.eq.f16 %p5, %rs4, %rs7;
+; SM70-NEXT:    selp.b16 %rs8, %rs6, %rs4, %p5;
+; SM70-NEXT:    cvt.u32.u16 %r9, %rs8;
+; SM70-NEXT:    shl.b32 %r10, %r9, %r1;
+; SM70-NEXT:    and.b32 %r11, %r14, %r2;
+; SM70-NEXT:    or.b32 %r12, %r11, %r10;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT:    setp.ne.b32 %p6, %r3, %r14;
+; SM70-NEXT:    mov.b32 %r14, %r3;
+; SM70-NEXT:    @%p6 bra $L__BB76_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r13, %r3, %r1;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b16 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fminimum ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+  ret half %retval
+}
+
+; f16 fmaximum: mirror of fminimum above (setp.gt.f16, +0.0 bit pattern 0).
+define half @fmaximum_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM70-LABEL: fmaximum_acq_rel_half_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<7>;
+; SM70-NEXT:    .reg .b16 %rs<9>;
+; SM70-NEXT:    .reg .b32 %r<15>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [fmaximum_acq_rel_half_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fmaximum_acq_rel_half_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT:    and.b32 %r5, %r4, 3;
+; SM70-NEXT:    shl.b32 %r1, %r5, 3;
+; SM70-NEXT:    mov.b32 %r6, 65535;
+; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
+; SM70-NEXT:    not.b32 %r2, %r7;
+; SM70-NEXT:    ld.global.b32 %r14, [%rd1];
+; SM70-NEXT:    setp.eq.b16 %p4, %rs1, 0;
+; SM70-NEXT:  $L__BB77_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    shr.u32 %r8, %r14, %r1;
+; SM70-NEXT:    cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT:    setp.gt.f16 %p1, %rs2, %rs1;
+; SM70-NEXT:    selp.b16 %rs3, %rs2, %rs1, %p1;
+; SM70-NEXT:    setp.nan.f16 %p2, %rs2, %rs1;
+; SM70-NEXT:    selp.b16 %rs4, 0x7E00, %rs3, %p2;
+; SM70-NEXT:    setp.eq.b16 %p3, %rs2, 0;
+; SM70-NEXT:    selp.b16 %rs5, %rs2, %rs4, %p3;
+; SM70-NEXT:    selp.b16 %rs6, %rs1, %rs5, %p4;
+; SM70-NEXT:    mov.b16 %rs7, 0x0000;
+; SM70-NEXT:    setp.eq.f16 %p5, %rs4, %rs7;
+; SM70-NEXT:    selp.b16 %rs8, %rs6, %rs4, %p5;
+; SM70-NEXT:    cvt.u32.u16 %r9, %rs8;
+; SM70-NEXT:    shl.b32 %r10, %r9, %r1;
+; SM70-NEXT:    and.b32 %r11, %r14, %r2;
+; SM70-NEXT:    or.b32 %r12, %r11, %r10;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM70-NEXT:    setp.ne.b32 %p6, %r3, %r14;
+; SM70-NEXT:    mov.b32 %r14, %r3;
+; SM70-NEXT:    @%p6 bra $L__BB77_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r13, %r3, %r1;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b16 [func_retval0], %r13;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fmaximum ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+  ret half %retval
+}
+
+; bf16 fadd: unlike f16 there is no native bf16 add atomic used here — sub-word
+; CAS expansion where the math is done in f32 (value widened via shl 16) and
+; narrowed back with round-to-nearest-even (bfe/add 0x7FFF) plus qNaN quieting
+; (or with 4194304 == 0x400000 when the f32 result is NaN).
+define bfloat @fadd_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM70-LABEL: fadd_acq_rel_bfloat_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<24>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [fadd_acq_rel_bfloat_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fadd_acq_rel_bfloat_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT:    and.b32 %r5, %r4, 3;
+; SM70-NEXT:    shl.b32 %r1, %r5, 3;
+; SM70-NEXT:    mov.b32 %r6, 65535;
+; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
+; SM70-NEXT:    not.b32 %r2, %r7;
+; SM70-NEXT:    ld.global.b32 %r23, [%rd1];
+; SM70-NEXT:    cvt.u32.u16 %r10, %rs1;
+; SM70-NEXT:    shl.b32 %r11, %r10, 16;
+; SM70-NEXT:  $L__BB78_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    shr.u32 %r8, %r23, %r1;
+; SM70-NEXT:    shl.b32 %r9, %r8, 16;
+; SM70-NEXT:    add.rn.f32 %r12, %r9, %r11;
+; SM70-NEXT:    bfe.u32 %r13, %r12, 16, 1;
+; SM70-NEXT:    add.s32 %r14, %r13, %r12;
+; SM70-NEXT:    add.s32 %r15, %r14, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r12, %r12;
+; SM70-NEXT:    or.b32 %r16, %r12, 4194304;
+; SM70-NEXT:    selp.b32 %r17, %r16, %r15, %p1;
+; SM70-NEXT:    shr.u32 %r18, %r17, 16;
+; SM70-NEXT:    shl.b32 %r19, %r18, %r1;
+; SM70-NEXT:    and.b32 %r20, %r23, %r2;
+; SM70-NEXT:    or.b32 %r21, %r20, %r19;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
+; SM70-NEXT:    setp.ne.b32 %p2, %r3, %r23;
+; SM70-NEXT:    mov.b32 %r23, %r3;
+; SM70-NEXT:    @%p2 bra $L__BB78_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r22, %r3, %r1;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b16 [func_retval0], %r22;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fadd ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+  ret bfloat %retval
+}
+
+; bf16 fsub: identical expansion to bf16 fadd above with sub.rn.f32 as the op.
+define bfloat @fsub_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM70-LABEL: fsub_acq_rel_bfloat_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<24>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [fsub_acq_rel_bfloat_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fsub_acq_rel_bfloat_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT:    and.b32 %r5, %r4, 3;
+; SM70-NEXT:    shl.b32 %r1, %r5, 3;
+; SM70-NEXT:    mov.b32 %r6, 65535;
+; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
+; SM70-NEXT:    not.b32 %r2, %r7;
+; SM70-NEXT:    ld.global.b32 %r23, [%rd1];
+; SM70-NEXT:    cvt.u32.u16 %r10, %rs1;
+; SM70-NEXT:    shl.b32 %r11, %r10, 16;
+; SM70-NEXT:  $L__BB79_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    shr.u32 %r8, %r23, %r1;
+; SM70-NEXT:    shl.b32 %r9, %r8, 16;
+; SM70-NEXT:    sub.rn.f32 %r12, %r9, %r11;
+; SM70-NEXT:    bfe.u32 %r13, %r12, 16, 1;
+; SM70-NEXT:    add.s32 %r14, %r13, %r12;
+; SM70-NEXT:    add.s32 %r15, %r14, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r12, %r12;
+; SM70-NEXT:    or.b32 %r16, %r12, 4194304;
+; SM70-NEXT:    selp.b32 %r17, %r16, %r15, %p1;
+; SM70-NEXT:    shr.u32 %r18, %r17, 16;
+; SM70-NEXT:    shl.b32 %r19, %r18, %r1;
+; SM70-NEXT:    and.b32 %r20, %r23, %r2;
+; SM70-NEXT:    or.b32 %r21, %r20, %r19;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
+; SM70-NEXT:    setp.ne.b32 %p2, %r3, %r23;
+; SM70-NEXT:    mov.b32 %r23, %r3;
+; SM70-NEXT:    @%p2 bra $L__BB79_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r22, %r3, %r1;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b16 [func_retval0], %r22;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fsub ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+  ret bfloat %retval
+}
+
+; bf16 fmin: same widen-to-f32 / CAS-loop expansion, using min.f32.
+define bfloat @fmin_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM70-LABEL: fmin_acq_rel_bfloat_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<24>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [fmin_acq_rel_bfloat_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fmin_acq_rel_bfloat_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT:    and.b32 %r5, %r4, 3;
+; SM70-NEXT:    shl.b32 %r1, %r5, 3;
+; SM70-NEXT:    mov.b32 %r6, 65535;
+; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
+; SM70-NEXT:    not.b32 %r2, %r7;
+; SM70-NEXT:    ld.global.b32 %r23, [%rd1];
+; SM70-NEXT:    cvt.u32.u16 %r10, %rs1;
+; SM70-NEXT:    shl.b32 %r11, %r10, 16;
+; SM70-NEXT:  $L__BB80_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    shr.u32 %r8, %r23, %r1;
+; SM70-NEXT:    shl.b32 %r9, %r8, 16;
+; SM70-NEXT:    min.f32 %r12, %r9, %r11;
+; SM70-NEXT:    bfe.u32 %r13, %r12, 16, 1;
+; SM70-NEXT:    add.s32 %r14, %r13, %r12;
+; SM70-NEXT:    add.s32 %r15, %r14, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r12, %r12;
+; SM70-NEXT:    or.b32 %r16, %r12, 4194304;
+; SM70-NEXT:    selp.b32 %r17, %r16, %r15, %p1;
+; SM70-NEXT:    shr.u32 %r18, %r17, 16;
+; SM70-NEXT:    shl.b32 %r19, %r18, %r1;
+; SM70-NEXT:    and.b32 %r20, %r23, %r2;
+; SM70-NEXT:    or.b32 %r21, %r20, %r19;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
+; SM70-NEXT:    setp.ne.b32 %p2, %r3, %r23;
+; SM70-NEXT:    mov.b32 %r23, %r3;
+; SM70-NEXT:    @%p2 bra $L__BB80_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r22, %r3, %r1;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b16 [func_retval0], %r22;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fmin ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+  ret bfloat %retval
+}
+
+; bf16 fmax: mirror of bf16 fmin above with max.f32.
+define bfloat @fmax_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM70-LABEL: fmax_acq_rel_bfloat_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<24>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [fmax_acq_rel_bfloat_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fmax_acq_rel_bfloat_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT:    and.b32 %r5, %r4, 3;
+; SM70-NEXT:    shl.b32 %r1, %r5, 3;
+; SM70-NEXT:    mov.b32 %r6, 65535;
+; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
+; SM70-NEXT:    not.b32 %r2, %r7;
+; SM70-NEXT:    ld.global.b32 %r23, [%rd1];
+; SM70-NEXT:    cvt.u32.u16 %r10, %rs1;
+; SM70-NEXT:    shl.b32 %r11, %r10, 16;
+; SM70-NEXT:  $L__BB81_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    shr.u32 %r8, %r23, %r1;
+; SM70-NEXT:    shl.b32 %r9, %r8, 16;
+; SM70-NEXT:    max.f32 %r12, %r9, %r11;
+; SM70-NEXT:    bfe.u32 %r13, %r12, 16, 1;
+; SM70-NEXT:    add.s32 %r14, %r13, %r12;
+; SM70-NEXT:    add.s32 %r15, %r14, 32767;
+; SM70-NEXT:    setp.nan.f32 %p1, %r12, %r12;
+; SM70-NEXT:    or.b32 %r16, %r12, 4194304;
+; SM70-NEXT:    selp.b32 %r17, %r16, %r15, %p1;
+; SM70-NEXT:    shr.u32 %r18, %r17, 16;
+; SM70-NEXT:    shl.b32 %r19, %r18, %r1;
+; SM70-NEXT:    and.b32 %r20, %r23, %r2;
+; SM70-NEXT:    or.b32 %r21, %r20, %r19;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r23, %r21;
+; SM70-NEXT:    setp.ne.b32 %p2, %r3, %r23;
+; SM70-NEXT:    mov.b32 %r23, %r3;
+; SM70-NEXT:    @%p2 bra $L__BB81_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r22, %r3, %r1;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b16 [func_retval0], %r22;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fmax ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+  ret bfloat %retval
+}
+
+; bf16 fminimum: sub-word CAS expansion with NaN-propagating / signed-zero
+; semantics; comparisons happen on the f32-widened bits, 0x7FC0 is the bf16
+; qNaN and -32768 the bf16 -0.0 bit pattern.
+define bfloat @fminimum_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM70-LABEL: fminimum_acq_rel_bfloat_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<7>;
+; SM70-NEXT:    .reg .b16 %rs<8>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [fminimum_acq_rel_bfloat_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fminimum_acq_rel_bfloat_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT:    and.b32 %r5, %r4, 3;
+; SM70-NEXT:    shl.b32 %r1, %r5, 3;
+; SM70-NEXT:    mov.b32 %r6, 65535;
+; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
+; SM70-NEXT:    not.b32 %r2, %r7;
+; SM70-NEXT:    ld.global.b32 %r19, [%rd1];
+; SM70-NEXT:    cvt.u32.u16 %r10, %rs1;
+; SM70-NEXT:    shl.b32 %r11, %r10, 16;
+; SM70-NEXT:    setp.eq.b16 %p4, %rs1, -32768;
+; SM70-NEXT:  $L__BB82_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    shr.u32 %r8, %r19, %r1;
+; SM70-NEXT:    cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT:    shl.b32 %r9, %r8, 16;
+; SM70-NEXT:    setp.lt.f32 %p1, %r9, %r11;
+; SM70-NEXT:    selp.b16 %rs3, %rs2, %rs1, %p1;
+; SM70-NEXT:    setp.nan.f32 %p2, %r9, %r11;
+; SM70-NEXT:    selp.b16 %rs4, 0x7FC0, %rs3, %p2;
+; SM70-NEXT:    setp.eq.b16 %p3, %rs2, -32768;
+; SM70-NEXT:    selp.b16 %rs5, %rs2, %rs4, %p3;
+; SM70-NEXT:    selp.b16 %rs6, %rs1, %rs5, %p4;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs4;
+; SM70-NEXT:    shl.b32 %r13, %r12, 16;
+; SM70-NEXT:    setp.eq.f32 %p5, %r13, 0f00000000;
+; SM70-NEXT:    selp.b16 %rs7, %rs6, %rs4, %p5;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs7;
+; SM70-NEXT:    shl.b32 %r15, %r14, %r1;
+; SM70-NEXT:    and.b32 %r16, %r19, %r2;
+; SM70-NEXT:    or.b32 %r17, %r16, %r15;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r19, %r17;
+; SM70-NEXT:    setp.ne.b32 %p6, %r3, %r19;
+; SM70-NEXT:    mov.b32 %r19, %r3;
+; SM70-NEXT:    @%p6 bra $L__BB82_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r18, %r3, %r1;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b16 [func_retval0], %r18;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fminimum ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+  ret bfloat %retval
+}
+
+; bf16 fmaximum: mirror of fminimum above (setp.gt.f32, +0.0 bit pattern 0).
+define bfloat @fmaximum_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM70-LABEL: fmaximum_acq_rel_bfloat_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<7>;
+; SM70-NEXT:    .reg .b16 %rs<8>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b16 %rs1, [fmaximum_acq_rel_bfloat_global_cta_param_1];
+; SM70-NEXT:    ld.param.b64 %rd2, [fmaximum_acq_rel_bfloat_global_cta_param_0];
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r4, %rd2;
+; SM70-NEXT:    and.b32 %r5, %r4, 3;
+; SM70-NEXT:    shl.b32 %r1, %r5, 3;
+; SM70-NEXT:    mov.b32 %r6, 65535;
+; SM70-NEXT:    shl.b32 %r7, %r6, %r1;
+; SM70-NEXT:    not.b32 %r2, %r7;
+; SM70-NEXT:    ld.global.b32 %r19, [%rd1];
+; SM70-NEXT:    cvt.u32.u16 %r10, %rs1;
+; SM70-NEXT:    shl.b32 %r11, %r10, 16;
+; SM70-NEXT:    setp.eq.b16 %p4, %rs1, 0;
+; SM70-NEXT:  $L__BB83_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    shr.u32 %r8, %r19, %r1;
+; SM70-NEXT:    cvt.u16.u32 %rs2, %r8;
+; SM70-NEXT:    shl.b32 %r9, %r8, 16;
+; SM70-NEXT:    setp.gt.f32 %p1, %r9, %r11;
+; SM70-NEXT:    selp.b16 %rs3, %rs2, %rs1, %p1;
+; SM70-NEXT:    setp.nan.f32 %p2, %r9, %r11;
+; SM70-NEXT:    selp.b16 %rs4, 0x7FC0, %rs3, %p2;
+; SM70-NEXT:    setp.eq.b16 %p3, %rs2, 0;
+; SM70-NEXT:    selp.b16 %rs5, %rs2, %rs4, %p3;
+; SM70-NEXT:    selp.b16 %rs6, %rs1, %rs5, %p4;
+; SM70-NEXT:    cvt.u32.u16 %r12, %rs4;
+; SM70-NEXT:    shl.b32 %r13, %r12, 16;
+; SM70-NEXT:    setp.eq.f32 %p5, %r13, 0f00000000;
+; SM70-NEXT:    selp.b16 %rs7, %rs6, %rs4, %p5;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs7;
+; SM70-NEXT:    shl.b32 %r15, %r14, %r1;
+; SM70-NEXT:    and.b32 %r16, %r19, %r2;
+; SM70-NEXT:    or.b32 %r17, %r16, %r15;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r19, %r17;
+; SM70-NEXT:    setp.ne.b32 %p6, %r3, %r19;
+; SM70-NEXT:    mov.b32 %r19, %r3;
+; SM70-NEXT:    @%p6 bra $L__BB83_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r18, %r3, %r1;
+; SM70-NEXT:    fence.acq_rel.cta;
+; SM70-NEXT:    st.param.b16 [func_retval0], %r18;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw fmaximum ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+  ret bfloat %retval
+}
+
+; i8 add at monotonic ordering: sub-word (byte-in-word) CAS expansion like the
+; f16/bf16 cases, with a 0xFF lane mask. Note: monotonic needs no ordering
+; fences, so unlike the acq_rel tests above there is no fence.acq_rel.cta
+; around the loop — the relaxed CAS alone suffices.
+define i8 @add_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: add_monotonic_i8_global_cta(
+; SM70:       {
+; SM70-NEXT:    .reg .pred %p<2>;
+; SM70-NEXT:    .reg .b32 %r<16>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT:  // %bb.0:
+; SM70-NEXT:    ld.param.b64 %rd2, [add_monotonic_i8_global_cta_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.b8 %r6, [add_monotonic_i8_global_cta_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT:    and.b32 %r8, %r7, 3;
+; SM70-NEXT:    shl.b32 %r1, %r8, 3;
+; SM70-NEXT:    mov.b32 %r9, 255;
+; SM70-NEXT:    shl.b32 %r2, %r9, %r1;
+; SM70-NEXT:    not.b32 %r3, %r2;
+; SM70-NEXT:    shl.b32 %r4, %r6, %r1;
+; SM70-NEXT:    ld.global.b32 %r15, [%rd1];
+; SM70-NEXT:  $L__BB84_1: // %atomicrmw.start
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    add.s32 %r10, %r15, %r4;
+; SM70-NEXT:    and.b32 %r11, %r10, %r2;
+; SM70-NEXT:    and.b32 %r12, %r15, %r3;
+; SM70-NEXT:    or.b32 %r13, %r12, %r11;
+; SM70-NEXT:    atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM70-NEXT:    setp.ne.b32 %p1, %r5, %r15;
+; SM70-NEXT:    mov.b32 %r15, %r5;
+; SM70-NEXT:    @%p1 bra $L__BB84_1;
+; SM70-NEXT:  // %bb.2: // %atomicrmw.end
+; SM70-NEXT:    shr.u32 %r14, %r5, %r1;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
+; SM70-NEXT:    ret;
+  %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") monotonic
+  ret i8 %retval
+}
+
+define i8 @add_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: add_acquire_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [add_acquire_i8_global_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.b8 %r6, [add_acquire_i8_global_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 255;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: $L__BB85_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: add.s32 %r10, %r15, %r4;
+; SM70-NEXT: and.b32 %r11, %r10, %r2;
+; SM70-NEXT: and.b32 %r12, %r15, %r3;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM70-NEXT: mov.b32 %r15, %r5;
+; SM70-NEXT: @%p1 bra $L__BB85_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r5, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") acquire
+ ret i8 %retval
+}
+
+define i8 @add_release_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: add_release_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [add_release_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b8 %r6, [add_release_i8_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 255;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: $L__BB86_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: add.s32 %r10, %r15, %r4;
+; SM70-NEXT: and.b32 %r11, %r10, %r2;
+; SM70-NEXT: and.b32 %r12, %r15, %r3;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM70-NEXT: mov.b32 %r15, %r5;
+; SM70-NEXT: @%p1 bra $L__BB86_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r5, %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") release
+ ret i8 %retval
+}
+
+define i8 @add_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: add_seq_cst_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<16>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [add_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b8 %r6, [add_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 255;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r15, [%rd1];
+; SM70-NEXT: $L__BB87_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: add.s32 %r10, %r15, %r4;
+; SM70-NEXT: and.b32 %r11, %r10, %r2;
+; SM70-NEXT: and.b32 %r12, %r15, %r3;
+; SM70-NEXT: or.b32 %r13, %r12, %r11;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM70-NEXT: mov.b32 %r15, %r5;
+; SM70-NEXT: @%p1 bra $L__BB87_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r14, %r5, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
+; SM70-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") seq_cst
+ ret i8 %retval
+}
+
+define i32 @add_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: add_monotonic_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [add_monotonic_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [add_monotonic_i32_global_cta_param_1];
+; SM70-NEXT: atom.relaxed.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") monotonic
+ ret i32 %retval
+}
+
+define i32 @add_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: add_acquire_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [add_acquire_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [add_acquire_i32_global_cta_param_1];
+; SM70-NEXT: atom.acquire.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") acquire
+ ret i32 %retval
+}
+
+define i32 @add_release_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: add_release_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [add_release_i32_global_cta_param_0];
+; SM70-NEXT: ld.param.b32 %r1, [add_release_i32_global_cta_param_1];
+; SM70-NEXT: atom.release.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") release
+ ret i32 %retval
+}
+
+define i32 @add_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: add_seq_cst_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .b32 %r<3>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd1, [add_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b32 %r1, [add_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT: atom.acquire.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") seq_cst
+ ret i32 %retval
+}
+
+define i8 @nand_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: nand_monotonic_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<17>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [nand_monotonic_i8_global_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.b8 %r6, [nand_monotonic_i8_global_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 255;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r16, [%rd1];
+; SM70-NEXT: $L__BB92_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r10, %r16, %r4;
+; SM70-NEXT: not.b32 %r11, %r10;
+; SM70-NEXT: and.b32 %r12, %r11, %r2;
+; SM70-NEXT: and.b32 %r13, %r16, %r3;
+; SM70-NEXT: or.b32 %r14, %r13, %r12;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM70-NEXT: mov.b32 %r16, %r5;
+; SM70-NEXT: @%p1 bra $L__BB92_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r15, %r5, %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r15;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") monotonic
+ ret i8 %retval
+}
+
+define i8 @nand_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: nand_acquire_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<17>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [nand_acquire_i8_global_cta_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.b8 %r6, [nand_acquire_i8_global_cta_param_1];
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 255;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r16, [%rd1];
+; SM70-NEXT: $L__BB93_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r10, %r16, %r4;
+; SM70-NEXT: not.b32 %r11, %r10;
+; SM70-NEXT: and.b32 %r12, %r11, %r2;
+; SM70-NEXT: and.b32 %r13, %r16, %r3;
+; SM70-NEXT: or.b32 %r14, %r13, %r12;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM70-NEXT: mov.b32 %r16, %r5;
+; SM70-NEXT: @%p1 bra $L__BB93_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r15, %r5, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r15;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") acquire
+ ret i8 %retval
+}
+
+define i8 @nand_release_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: nand_release_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<17>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [nand_release_i8_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.param.b8 %r6, [nand_release_i8_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 255;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r16, [%rd1];
+; SM70-NEXT: $L__BB94_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r10, %r16, %r4;
+; SM70-NEXT: not.b32 %r11, %r10;
+; SM70-NEXT: and.b32 %r12, %r11, %r2;
+; SM70-NEXT: and.b32 %r13, %r16, %r3;
+; SM70-NEXT: or.b32 %r14, %r13, %r12;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM70-NEXT: mov.b32 %r16, %r5;
+; SM70-NEXT: @%p1 bra $L__BB94_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r15, %r5, %r1;
+; SM70-NEXT: st.param.b32 [func_retval0], %r15;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") release
+ ret i8 %retval
+}
+
+define i8 @nand_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM70-LABEL: nand_seq_cst_i8_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<17>;
+; SM70-NEXT: .reg .b64 %rd<3>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b64 %rd2, [nand_seq_cst_i8_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.param.b8 %r6, [nand_seq_cst_i8_global_cta_param_1];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM70-NEXT: and.b32 %r8, %r7, 3;
+; SM70-NEXT: shl.b32 %r1, %r8, 3;
+; SM70-NEXT: mov.b32 %r9, 255;
+; SM70-NEXT: shl.b32 %r2, %r9, %r1;
+; SM70-NEXT: not.b32 %r3, %r2;
+; SM70-NEXT: shl.b32 %r4, %r6, %r1;
+; SM70-NEXT: ld.global.b32 %r16, [%rd1];
+; SM70-NEXT: $L__BB95_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r10, %r16, %r4;
+; SM70-NEXT: not.b32 %r11, %r10;
+; SM70-NEXT: and.b32 %r12, %r11, %r2;
+; SM70-NEXT: and.b32 %r13, %r16, %r3;
+; SM70-NEXT: or.b32 %r14, %r13, %r12;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM70-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM70-NEXT: mov.b32 %r16, %r5;
+; SM70-NEXT: @%p1 bra $L__BB95_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: shr.u32 %r15, %r5, %r1;
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r15;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") seq_cst
+ ret i8 %retval
+}
+
+define i32 @nand_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: nand_monotonic_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<6>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b32 %r2, [nand_monotonic_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd1, [nand_monotonic_i32_global_cta_param_0];
+; SM70-NEXT: ld.global.b32 %r5, [%rd1];
+; SM70-NEXT: $L__BB96_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r3, %r5, %r2;
+; SM70-NEXT: not.b32 %r4, %r3;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM70-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM70-NEXT: mov.b32 %r5, %r1;
+; SM70-NEXT: @%p1 bra $L__BB96_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") monotonic
+ ret i32 %retval
+}
+
+define i32 @nand_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: nand_acquire_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<6>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b32 %r2, [nand_acquire_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd1, [nand_acquire_i32_global_cta_param_0];
+; SM70-NEXT: ld.global.b32 %r5, [%rd1];
+; SM70-NEXT: $L__BB97_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r3, %r5, %r2;
+; SM70-NEXT: not.b32 %r4, %r3;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM70-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM70-NEXT: mov.b32 %r5, %r1;
+; SM70-NEXT: @%p1 bra $L__BB97_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") acquire
+ ret i32 %retval
+}
+
+define i32 @nand_release_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: nand_release_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<6>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b32 %r2, [nand_release_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd1, [nand_release_i32_global_cta_param_0];
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ld.global.b32 %r5, [%rd1];
+; SM70-NEXT: $L__BB98_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r3, %r5, %r2;
+; SM70-NEXT: not.b32 %r4, %r3;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM70-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM70-NEXT: mov.b32 %r5, %r1;
+; SM70-NEXT: @%p1 bra $L__BB98_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") release
+ ret i32 %retval
+}
+
+define i32 @nand_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM70-LABEL: nand_seq_cst_i32_global_cta(
+; SM70: {
+; SM70-NEXT: .reg .pred %p<2>;
+; SM70-NEXT: .reg .b32 %r<6>;
+; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: ld.param.b32 %r2, [nand_seq_cst_i32_global_cta_param_1];
+; SM70-NEXT: ld.param.b64 %rd1, [nand_seq_cst_i32_global_cta_param_0];
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ld.global.b32 %r5, [%rd1];
+; SM70-NEXT: $L__BB99_1: // %atomicrmw.start
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: and.b32 %r3, %r5, %r2;
+; SM70-NEXT: not.b32 %r4, %r3;
+; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM70-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM70-NEXT: mov.b32 %r5, %r1;
+; SM70-NEXT: @%p1 bra $L__BB99_1;
+; SM70-NEXT: // %bb.2: // %atomicrmw.end
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: st.param.b32 [func_retval0], %r1;
+; SM70-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") seq_cst
+ ret i32 %retval
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-sm90.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-sm90.ll
new file mode 100644
index 0000000000000..82680b3b1aeec
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomicrmw-sm90.ll
@@ -0,0 +1,2983 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
+
+define i8 @xchg_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: xchg_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<14>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r5, [xchg_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r6, %rd2;
+; SM90-NEXT: and.b32 %r7, %r6, 3;
+; SM90-NEXT: shl.b32 %r1, %r7, 3;
+; SM90-NEXT: mov.b32 %r8, 255;
+; SM90-NEXT: shl.b32 %r9, %r8, %r1;
+; SM90-NEXT: not.b32 %r2, %r9;
+; SM90-NEXT: shl.b32 %r3, %r5, %r1;
+; SM90-NEXT: ld.global.b32 %r13, [%rd1];
+; SM90-NEXT: $L__BB0_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r10, %r13, %r2;
+; SM90-NEXT: or.b32 %r11, %r10, %r3;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r4, [%rd1], %r13, %r11;
+; SM90-NEXT: setp.ne.b32 %p1, %r4, %r13;
+; SM90-NEXT: mov.b32 %r13, %r4;
+; SM90-NEXT: @%p1 bra $L__BB0_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r12, %r4, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r12;
+; SM90-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @xchg_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: xchg_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<14>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b16 %r5, [xchg_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r6, %rd2;
+; SM90-NEXT: and.b32 %r7, %r6, 3;
+; SM90-NEXT: shl.b32 %r1, %r7, 3;
+; SM90-NEXT: mov.b32 %r8, 65535;
+; SM90-NEXT: shl.b32 %r9, %r8, %r1;
+; SM90-NEXT: not.b32 %r2, %r9;
+; SM90-NEXT: shl.b32 %r3, %r5, %r1;
+; SM90-NEXT: ld.global.b32 %r13, [%rd1];
+; SM90-NEXT: $L__BB1_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r10, %r13, %r2;
+; SM90-NEXT: or.b32 %r11, %r10, %r3;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r4, [%rd1], %r13, %r11;
+; SM90-NEXT: setp.ne.b32 %p1, %r4, %r13;
+; SM90-NEXT: mov.b32 %r13, %r4;
+; SM90-NEXT: @%p1 bra $L__BB1_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r12, %r4, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r12;
+; SM90-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @xchg_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: xchg_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [xchg_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [xchg_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.exch.b32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @xchg_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: xchg_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<4>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [xchg_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.b64 %rd2, [xchg_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.exch.b64 %rd3, [%rd1], %rd2;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %retval = atomicrmw xchg ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @add_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: add_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [add_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r6, [add_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 255;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: $L__BB4_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: add.s32 %r10, %r15, %r4;
+; SM90-NEXT: and.b32 %r11, %r10, %r2;
+; SM90-NEXT: and.b32 %r12, %r15, %r3;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM90-NEXT: mov.b32 %r15, %r5;
+; SM90-NEXT: @%p1 bra $L__BB4_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r5, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @add_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: add_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [add_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b16 %r6, [add_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 65535;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: $L__BB5_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: add.s32 %r10, %r15, %r4;
+; SM90-NEXT: and.b32 %r11, %r10, %r2;
+; SM90-NEXT: and.b32 %r12, %r15, %r3;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM90-NEXT: mov.b32 %r15, %r5;
+; SM90-NEXT: @%p1 bra $L__BB5_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r5, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @add_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: add_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [add_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [add_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @add_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: add_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<4>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [add_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.b64 %rd2, [add_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.add.u64 %rd3, [%rd1], %rd2;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %retval = atomicrmw add ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @sub_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: sub_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r6, [sub_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 255;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: $L__BB8_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: sub.s32 %r10, %r15, %r4;
+; SM90-NEXT: and.b32 %r11, %r10, %r2;
+; SM90-NEXT: and.b32 %r12, %r15, %r3;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM90-NEXT: mov.b32 %r15, %r5;
+; SM90-NEXT: @%p1 bra $L__BB8_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r5, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @sub_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: sub_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b16 %r6, [sub_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 65535;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: $L__BB9_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: sub.s32 %r10, %r15, %r4;
+; SM90-NEXT: and.b32 %r11, %r10, %r2;
+; SM90-NEXT: and.b32 %r12, %r15, %r3;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM90-NEXT: mov.b32 %r15, %r5;
+; SM90-NEXT: @%p1 bra $L__BB9_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r5, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @sub_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: sub_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<4>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [sub_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [sub_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: neg.s32 %r2, %r1;
+; SM90-NEXT: atom.acq_rel.cta.global.add.u32 %r3, [%rd1], %r2;
+; SM90-NEXT: st.param.b32 [func_retval0], %r3;
+; SM90-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @sub_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: sub_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<5>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [sub_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.b64 %rd2, [sub_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: neg.s64 %rd3, %rd2;
+; SM90-NEXT: atom.acq_rel.cta.global.add.u64 %rd4, [%rd1], %rd3;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd4;
+; SM90-NEXT: ret;
+ %retval = atomicrmw sub ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+; COM: atomicrmw and, acq_rel, syncscope("block") ("cta" scope in PTX).
+; COM: i8/i16 are emulated on the aligned containing 32-bit word: the operand is
+; COM: shifted into position and the mask complement is OR-ed in so the other
+; COM: bytes are preserved; the atom itself is relaxed, bracketed by cta-scoped
+; COM: release/acquire fences. i32/i64 lower directly to atom.acq_rel.cta.global.and.
+define i8 @and_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: and_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<12>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [and_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r1, [and_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd2, %rd1, -4;
+; SM90-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM90-NEXT: and.b32 %r3, %r2, 3;
+; SM90-NEXT: shl.b32 %r4, %r3, 3;
+; SM90-NEXT: mov.b32 %r5, 255;
+; SM90-NEXT: shl.b32 %r6, %r5, %r4;
+; SM90-NEXT: not.b32 %r7, %r6;
+; SM90-NEXT: shl.b32 %r8, %r1, %r4;
+; SM90-NEXT: or.b32 %r9, %r8, %r7;
+; SM90-NEXT: atom.relaxed.cta.global.and.b32 %r10, [%rd2], %r9;
+; SM90-NEXT: shr.u32 %r11, %r10, %r4;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r11;
+; SM90-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @and_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: and_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<12>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [and_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b16 %r1, [and_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd2, %rd1, -4;
+; SM90-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM90-NEXT: and.b32 %r3, %r2, 3;
+; SM90-NEXT: shl.b32 %r4, %r3, 3;
+; SM90-NEXT: mov.b32 %r5, 65535;
+; SM90-NEXT: shl.b32 %r6, %r5, %r4;
+; SM90-NEXT: not.b32 %r7, %r6;
+; SM90-NEXT: shl.b32 %r8, %r1, %r4;
+; SM90-NEXT: or.b32 %r9, %r8, %r7;
+; SM90-NEXT: atom.relaxed.cta.global.and.b32 %r10, [%rd2], %r9;
+; SM90-NEXT: shr.u32 %r11, %r10, %r4;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r11;
+; SM90-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @and_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: and_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [and_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [and_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.and.b32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @and_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: and_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<4>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [and_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.b64 %rd2, [and_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.and.b64 %rd3, [%rd1], %rd2;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %retval = atomicrmw and ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+; COM: atomicrmw nand, acq_rel, syncscope("block"). PTX has no native nand atom,
+; COM: so every width is expanded to a compare-exchange loop (atomicrmw.start /
+; COM: atomicrmw.end) around atom.relaxed.cta.global.cas, with the acq_rel
+; COM: ordering recovered via cta-scoped release/acquire fences. i8/i16 perform
+; COM: the nand on the byte/halfword inside the containing 32-bit word.
+define i8 @nand_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: nand_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<17>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r6, [nand_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 255;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
+; SM90-NEXT: $L__BB16_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r10, %r16, %r4;
+; SM90-NEXT: not.b32 %r11, %r10;
+; SM90-NEXT: and.b32 %r12, %r11, %r2;
+; SM90-NEXT: and.b32 %r13, %r16, %r3;
+; SM90-NEXT: or.b32 %r14, %r13, %r12;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM90-NEXT: mov.b32 %r16, %r5;
+; SM90-NEXT: @%p1 bra $L__BB16_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r15, %r5, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r15;
+; SM90-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @nand_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: nand_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<17>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b16 %r6, [nand_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 65535;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
+; SM90-NEXT: $L__BB17_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r10, %r16, %r4;
+; SM90-NEXT: not.b32 %r11, %r10;
+; SM90-NEXT: and.b32 %r12, %r11, %r2;
+; SM90-NEXT: and.b32 %r13, %r16, %r3;
+; SM90-NEXT: or.b32 %r14, %r13, %r12;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM90-NEXT: mov.b32 %r16, %r5;
+; SM90-NEXT: @%p1 bra $L__BB17_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r15, %r5, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r15;
+; SM90-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @nand_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: nand_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<6>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [nand_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [nand_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b32 %r5, [%rd1];
+; SM90-NEXT: $L__BB18_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r3, %r5, %r2;
+; SM90-NEXT: not.b32 %r4, %r3;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM90-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM90-NEXT: mov.b32 %r5, %r1;
+; SM90-NEXT: @%p1 bra $L__BB18_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @nand_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: nand_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b64 %rd<7>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd3, [nand_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [nand_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM90-NEXT: $L__BB19_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b64 %rd4, %rd6, %rd3;
+; SM90-NEXT: not.b64 %rd5, %rd4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM90-NEXT: setp.ne.b64 %p1, %rd1, %rd6;
+; SM90-NEXT: mov.b64 %rd6, %rd1;
+; SM90-NEXT: @%p1 bra $L__BB19_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+; COM: atomicrmw or, acq_rel, syncscope("block"). For i8/i16 no mask complement
+; COM: is needed (OR-ing zeros is a no-op), so the shifted operand is applied
+; COM: directly with a relaxed 32-bit atom.or on the containing word between
+; COM: cta-scoped fences. i32/i64 use atom.acq_rel.cta.global.or natively.
+define i8 @or_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: or_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<8>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [or_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r1, [or_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd2, %rd1, -4;
+; SM90-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM90-NEXT: and.b32 %r3, %r2, 3;
+; SM90-NEXT: shl.b32 %r4, %r3, 3;
+; SM90-NEXT: shl.b32 %r5, %r1, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.or.b32 %r6, [%rd2], %r5;
+; SM90-NEXT: shr.u32 %r7, %r6, %r4;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r7;
+; SM90-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @or_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: or_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<8>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [or_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b16 %r1, [or_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd2, %rd1, -4;
+; SM90-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM90-NEXT: and.b32 %r3, %r2, 3;
+; SM90-NEXT: shl.b32 %r4, %r3, 3;
+; SM90-NEXT: shl.b32 %r5, %r1, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.or.b32 %r6, [%rd2], %r5;
+; SM90-NEXT: shr.u32 %r7, %r6, %r4;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r7;
+; SM90-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @or_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: or_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [or_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [or_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.or.b32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @or_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: or_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<4>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [or_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.b64 %rd2, [or_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.or.b64 %rd3, [%rd1], %rd2;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %retval = atomicrmw or ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+; COM: atomicrmw xor, acq_rel, syncscope("block"). Same shape as the or tests:
+; COM: i8/i16 XOR the shifted operand into the containing 32-bit word with a
+; COM: relaxed atom.xor between cta-scoped release/acquire fences (XOR with zero
+; COM: leaves the other lanes untouched); i32/i64 use atom.acq_rel.cta.global.xor.
+define i8 @xor_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: xor_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<8>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r1, [xor_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd2, %rd1, -4;
+; SM90-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM90-NEXT: and.b32 %r3, %r2, 3;
+; SM90-NEXT: shl.b32 %r4, %r3, 3;
+; SM90-NEXT: shl.b32 %r5, %r1, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.xor.b32 %r6, [%rd2], %r5;
+; SM90-NEXT: shr.u32 %r7, %r6, %r4;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r7;
+; SM90-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @xor_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: xor_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<8>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b16 %r1, [xor_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd2, %rd1, -4;
+; SM90-NEXT: cvt.u32.u64 %r2, %rd1;
+; SM90-NEXT: and.b32 %r3, %r2, 3;
+; SM90-NEXT: shl.b32 %r4, %r3, 3;
+; SM90-NEXT: shl.b32 %r5, %r1, %r4;
+; SM90-NEXT: atom.relaxed.cta.global.xor.b32 %r6, [%rd2], %r5;
+; SM90-NEXT: shr.u32 %r7, %r6, %r4;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r7;
+; SM90-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @xor_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: xor_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [xor_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.xor.b32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @xor_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: xor_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<4>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [xor_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.b64 %rd2, [xor_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.xor.b64 %rd3, [%rd1], %rd2;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %retval = atomicrmw xor ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+; COM: atomicrmw max (signed), acq_rel, syncscope("block"). i8/i16 expand to a
+; COM: CAS loop on the containing 32-bit word; the i8 variant sign-extends the
+; COM: loaded byte (cvt.s8.s32 / cvt.s16.s8) before the max.s16 compare so the
+; COM: signed semantics are preserved. i32/i64 use atom.acq_rel.cta.global.max.s32/.s64.
+define i8 @max_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: max_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<5>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b8 %rs1, [max_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [max_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 255;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: cvt.s16.s8 %rs3, %rs1;
+; SM90-NEXT: $L__BB28_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r15, %r1;
+; SM90-NEXT: cvt.s8.s32 %rs2, %r8;
+; SM90-NEXT: max.s16 %rs4, %rs2, %rs3;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM90-NEXT: and.b32 %r10, %r9, 255;
+; SM90-NEXT: shl.b32 %r11, %r10, %r1;
+; SM90-NEXT: and.b32 %r12, %r15, %r2;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r15;
+; SM90-NEXT: mov.b32 %r15, %r3;
+; SM90-NEXT: @%p1 bra $L__BB28_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @max_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: max_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [max_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [max_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB29_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: max.s16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB29_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @max_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: max_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [max_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [max_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.max.s32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @max_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: max_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<4>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [max_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.b64 %rd2, [max_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.max.s64 %rd3, [%rd1], %rd2;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %retval = atomicrmw max ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+; COM: atomicrmw min (signed), acq_rel, syncscope("block"). Mirrors the max
+; COM: tests: i8/i16 use a CAS loop with sign extension and min.s16; i32/i64 use
+; COM: atom.acq_rel.cta.global.min.s32/.s64 directly.
+define i8 @min_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: min_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<5>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b8 %rs1, [min_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [min_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 255;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: cvt.s16.s8 %rs3, %rs1;
+; SM90-NEXT: $L__BB32_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r15, %r1;
+; SM90-NEXT: cvt.s8.s32 %rs2, %r8;
+; SM90-NEXT: min.s16 %rs4, %rs2, %rs3;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM90-NEXT: and.b32 %r10, %r9, 255;
+; SM90-NEXT: shl.b32 %r11, %r10, %r1;
+; SM90-NEXT: and.b32 %r12, %r15, %r2;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r15;
+; SM90-NEXT: mov.b32 %r15, %r3;
+; SM90-NEXT: @%p1 bra $L__BB32_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @min_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: min_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [min_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [min_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB33_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: min.s16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB33_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @min_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: min_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [min_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [min_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.min.s32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @min_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: min_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<4>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [min_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.b64 %rd2, [min_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.min.s64 %rd3, [%rd1], %rd2;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %retval = atomicrmw min ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+; COM: atomicrmw umax (unsigned), acq_rel, syncscope("block"). i8/i16 expand to
+; COM: a CAS loop; the i8 variant zero-extends the loaded byte (and.b16 255)
+; COM: before max.u16 so the unsigned compare is correct. i32/i64 map to
+; COM: atom.acq_rel.cta.global.max.u32/.u64.
+define i8 @umax_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: umax_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<5>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b8 %rs1, [umax_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 255;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB36_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: and.b16 %rs3, %rs2, 255;
+; SM90-NEXT: max.u16 %rs4, %rs3, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB36_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @umax_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: umax_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [umax_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB37_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: max.u16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB37_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @umax_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: umax_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [umax_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [umax_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.max.u32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @umax_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: umax_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<4>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [umax_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.b64 %rd2, [umax_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.max.u64 %rd3, [%rd1], %rd2;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %retval = atomicrmw umax ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+; COM: atomicrmw umin (unsigned), acq_rel, syncscope("block"). Mirrors the umax
+; COM: tests: i8/i16 use a CAS loop with zero extension and min.u16; i32/i64 use
+; COM: atom.acq_rel.cta.global.min.u32/.u64 directly.
+define i8 @umin_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: umin_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<5>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b8 %rs1, [umin_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 255;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB40_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: and.b16 %rs3, %rs2, 255;
+; SM90-NEXT: min.u16 %rs4, %rs3, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB40_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @umin_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: umin_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [umin_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB41_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: min.u16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB41_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @umin_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: umin_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [umin_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [umin_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.min.u32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @umin_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: umin_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<4>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [umin_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: ld.param.b64 %rd2, [umin_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.min.u64 %rd3, [%rd1], %rd2;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %retval = atomicrmw umin ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+; COM: atomicrmw uinc_wrap, acq_rel, syncscope("block"): increment, wrapping to
+; COM: 0 once the old value reaches %val. i8 expands to a CAS loop on the
+; COM: containing 32-bit word; the wrap is realized as add + setp.ge.u16 + selp
+; COM: (select 0 when old >= %val), with the byte zero-extended via and.b16 255
+; COM: before the unsigned compare.
+define i8 @uinc_wrap_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: uinc_wrap_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<6>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b8 %rs1, [uinc_wrap_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 255;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: $L__BB44_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r15, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: and.b16 %rs3, %rs2, 255;
+; SM90-NEXT: add.s16 %rs4, %rs2, 1;
+; SM90-NEXT: setp.ge.u16 %p1, %rs3, %rs1;
+; SM90-NEXT: selp.b16 %rs5, 0, %rs4, %p1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM90-NEXT: and.b32 %r10, %r9, 255;
+; SM90-NEXT: shl.b32 %r11, %r10, %r1;
+; SM90-NEXT: and.b32 %r12, %r15, %r2;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p2, %r3, %r15;
+; SM90-NEXT: mov.b32 %r15, %r3;
+; SM90-NEXT: @%p2 bra $L__BB44_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @uinc_wrap_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: uinc_wrap_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<5>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [uinc_wrap_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB45_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: add.s16 %rs3, %rs2, 1;
+; SM90-NEXT: setp.ge.u16 %p1, %rs2, %rs1;
+; SM90-NEXT: selp.b16 %rs4, 0, %rs3, %p1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p2, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p2 bra $L__BB45_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @uinc_wrap_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: uinc_wrap_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [uinc_wrap_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [uinc_wrap_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.inc.u32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @uinc_wrap_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: uinc_wrap_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b64 %rd<7>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd3, [uinc_wrap_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [uinc_wrap_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM90-NEXT: $L__BB47_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: add.s64 %rd4, %rd6, 1;
+; SM90-NEXT: setp.ge.u64 %p1, %rd6, %rd3;
+; SM90-NEXT: selp.b64 %rd5, 0, %rd4, %p1;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM90-NEXT: setp.ne.b64 %p2, %rd1, %rd6;
+; SM90-NEXT: mov.b64 %rd6, %rd1;
+; SM90-NEXT: @%p2 bra $L__BB47_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw uinc_wrap ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @udec_wrap_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: udec_wrap_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<4>;
+; SM90-NEXT: .reg .b16 %rs<7>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b8 %rs1, [udec_wrap_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 255;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: $L__BB48_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r15, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: and.b16 %rs3, %rs2, 255;
+; SM90-NEXT: add.s16 %rs4, %rs2, -1;
+; SM90-NEXT: setp.eq.b16 %p1, %rs3, 0;
+; SM90-NEXT: setp.gt.u16 %p2, %rs3, %rs1;
+; SM90-NEXT: selp.b16 %rs5, %rs1, %rs4, %p2;
+; SM90-NEXT: selp.b16 %rs6, %rs1, %rs5, %p1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs6;
+; SM90-NEXT: and.b32 %r10, %r9, 255;
+; SM90-NEXT: shl.b32 %r11, %r10, %r1;
+; SM90-NEXT: and.b32 %r12, %r15, %r2;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p3, %r3, %r15;
+; SM90-NEXT: mov.b32 %r15, %r3;
+; SM90-NEXT: @%p3 bra $L__BB48_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @udec_wrap_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: udec_wrap_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<4>;
+; SM90-NEXT: .reg .b16 %rs<6>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [udec_wrap_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB49_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: add.s16 %rs3, %rs2, -1;
+; SM90-NEXT: setp.eq.b16 %p1, %rs2, 0;
+; SM90-NEXT: setp.gt.u16 %p2, %rs2, %rs1;
+; SM90-NEXT: selp.b16 %rs4, %rs1, %rs3, %p2;
+; SM90-NEXT: selp.b16 %rs5, %rs1, %rs4, %p1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p3, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p3 bra $L__BB49_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @udec_wrap_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: udec_wrap_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [udec_wrap_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [udec_wrap_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.dec.u32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @udec_wrap_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: udec_wrap_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<4>;
+; SM90-NEXT: .reg .b64 %rd<8>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd3, [udec_wrap_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [udec_wrap_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b64 %rd7, [%rd2];
+; SM90-NEXT: $L__BB51_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: add.s64 %rd4, %rd7, -1;
+; SM90-NEXT: setp.eq.b64 %p1, %rd7, 0;
+; SM90-NEXT: setp.gt.u64 %p2, %rd7, %rd3;
+; SM90-NEXT: selp.b64 %rd5, %rd3, %rd4, %p2;
+; SM90-NEXT: selp.b64 %rd6, %rd3, %rd5, %p1;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd7, %rd6;
+; SM90-NEXT: setp.ne.b64 %p3, %rd1, %rd7;
+; SM90-NEXT: mov.b64 %rd7, %rd1;
+; SM90-NEXT: @%p3 bra $L__BB51_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw udec_wrap ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @usub_cond_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: usub_cond_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<6>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b8 %rs1, [usub_cond_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 255;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: $L__BB52_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r15, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: and.b16 %rs3, %rs2, 255;
+; SM90-NEXT: setp.ge.u16 %p1, %rs3, %rs1;
+; SM90-NEXT: sub.s16 %rs4, %rs2, %rs1;
+; SM90-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM90-NEXT: and.b32 %r10, %r9, 255;
+; SM90-NEXT: shl.b32 %r11, %r10, %r1;
+; SM90-NEXT: and.b32 %r12, %r15, %r2;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p2, %r3, %r15;
+; SM90-NEXT: mov.b32 %r15, %r3;
+; SM90-NEXT: @%p2 bra $L__BB52_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @usub_cond_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: usub_cond_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b16 %rs<5>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [usub_cond_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB53_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: setp.ge.u16 %p1, %rs2, %rs1;
+; SM90-NEXT: sub.s16 %rs3, %rs2, %rs1;
+; SM90-NEXT: selp.b16 %rs4, %rs3, %rs2, %p1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p2, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p2 bra $L__BB53_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @usub_cond_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: usub_cond_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b32 %r<6>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [usub_cond_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [usub_cond_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b32 %r5, [%rd1];
+; SM90-NEXT: $L__BB54_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: setp.ge.u32 %p1, %r5, %r2;
+; SM90-NEXT: sub.s32 %r3, %r5, %r2;
+; SM90-NEXT: selp.b32 %r4, %r3, %r5, %p1;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM90-NEXT: setp.ne.b32 %p2, %r1, %r5;
+; SM90-NEXT: mov.b32 %r5, %r1;
+; SM90-NEXT: @%p2 bra $L__BB54_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @usub_cond_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: usub_cond_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<3>;
+; SM90-NEXT: .reg .b64 %rd<7>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd3, [usub_cond_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [usub_cond_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM90-NEXT: $L__BB55_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: setp.ge.u64 %p1, %rd6, %rd3;
+; SM90-NEXT: sub.s64 %rd4, %rd6, %rd3;
+; SM90-NEXT: selp.b64 %rd5, %rd4, %rd6, %p1;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM90-NEXT: setp.ne.b64 %p2, %rd1, %rd6;
+; SM90-NEXT: mov.b64 %rd6, %rd1;
+; SM90-NEXT: @%p2 bra $L__BB55_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw usub_cond ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define i8 @usub_sat_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: usub_sat_acq_rel_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<6>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b8 %rs1, [usub_sat_acq_rel_i8_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 255;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB56_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: and.b16 %rs3, %rs2, 255;
+; SM90-NEXT: max.u16 %rs4, %rs3, %rs1;
+; SM90-NEXT: sub.s16 %rs5, %rs4, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs5;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB56_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i8 %val syncscope("block") acq_rel
+ ret i8 %retval
+}
+
+define i16 @usub_sat_acq_rel_i16_global_cta(ptr addrspace(1) %addr, i16 %val) {
+; SM90-LABEL: usub_sat_acq_rel_i16_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<5>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [usub_sat_acq_rel_i16_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i16_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB57_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: max.u16 %rs3, %rs2, %rs1;
+; SM90-NEXT: sub.s16 %rs4, %rs3, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs4;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB57_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i16 %val syncscope("block") acq_rel
+ ret i16 %retval
+}
+
+define i32 @usub_sat_acq_rel_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: usub_sat_acq_rel_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<6>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [usub_sat_acq_rel_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [usub_sat_acq_rel_i32_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b32 %r5, [%rd1];
+; SM90-NEXT: $L__BB58_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: max.u32 %r3, %r5, %r2;
+; SM90-NEXT: sub.s32 %r4, %r3, %r2;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM90-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM90-NEXT: mov.b32 %r5, %r1;
+; SM90-NEXT: @%p1 bra $L__BB58_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i32 %val syncscope("block") acq_rel
+ ret i32 %retval
+}
+
+define i64 @usub_sat_acq_rel_i64_global_cta(ptr addrspace(1) %addr, i64 %val) {
+; SM90-LABEL: usub_sat_acq_rel_i64_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b64 %rd<7>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd3, [usub_sat_acq_rel_i64_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [usub_sat_acq_rel_i64_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b64 %rd6, [%rd2];
+; SM90-NEXT: $L__BB59_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: max.u64 %rd4, %rd6, %rd3;
+; SM90-NEXT: sub.s64 %rd5, %rd4, %rd3;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd6, %rd5;
+; SM90-NEXT: setp.ne.b64 %p1, %rd1, %rd6;
+; SM90-NEXT: mov.b64 %rd6, %rd1;
+; SM90-NEXT: @%p1 bra $L__BB59_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw usub_sat ptr addrspace(1) %addr, i64 %val syncscope("block") acq_rel
+ ret i64 %retval
+}
+
+define float @fadd_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM90-LABEL: fadd_acq_rel_float_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [fadd_acq_rel_float_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [fadd_acq_rel_float_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.add.f32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ %retval = atomicrmw fadd ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fsub_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM90-LABEL: fsub_acq_rel_float_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<5>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [fsub_acq_rel_float_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [fsub_acq_rel_float_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b32 %r4, [%rd1];
+; SM90-NEXT: $L__BB61_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: sub.rn.f32 %r3, %r4, %r2;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
+; SM90-NEXT: setp.ne.b32 %p1, %r1, %r4;
+; SM90-NEXT: mov.b32 %r4, %r1;
+; SM90-NEXT: @%p1 bra $L__BB61_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw fsub ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fmin_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM90-LABEL: fmin_acq_rel_float_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<5>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [fmin_acq_rel_float_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [fmin_acq_rel_float_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b32 %r4, [%rd1];
+; SM90-NEXT: $L__BB62_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: min.f32 %r3, %r4, %r2;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
+; SM90-NEXT: setp.ne.b32 %p1, %r1, %r4;
+; SM90-NEXT: mov.b32 %r4, %r1;
+; SM90-NEXT: @%p1 bra $L__BB62_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw fmin ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fmax_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM90-LABEL: fmax_acq_rel_float_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<5>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [fmax_acq_rel_float_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [fmax_acq_rel_float_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b32 %r4, [%rd1];
+; SM90-NEXT: $L__BB63_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: max.f32 %r3, %r4, %r2;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
+; SM90-NEXT: setp.ne.b32 %p1, %r1, %r4;
+; SM90-NEXT: mov.b32 %r4, %r1;
+; SM90-NEXT: @%p1 bra $L__BB63_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw fmax ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fminimum_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM90-LABEL: fminimum_acq_rel_float_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<5>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [fminimum_acq_rel_float_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [fminimum_acq_rel_float_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b32 %r4, [%rd1];
+; SM90-NEXT: $L__BB64_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: min.NaN.f32 %r3, %r4, %r2;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
+; SM90-NEXT: setp.ne.b32 %p1, %r1, %r4;
+; SM90-NEXT: mov.b32 %r4, %r1;
+; SM90-NEXT: @%p1 bra $L__BB64_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw fminimum ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define float @fmaximum_acq_rel_float_global_cta(ptr addrspace(1) %addr, float %val) {
+; SM90-LABEL: fmaximum_acq_rel_float_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<5>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [fmaximum_acq_rel_float_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [fmaximum_acq_rel_float_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b32 %r4, [%rd1];
+; SM90-NEXT: $L__BB65_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: max.NaN.f32 %r3, %r4, %r2;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r4, %r3;
+; SM90-NEXT: setp.ne.b32 %p1, %r1, %r4;
+; SM90-NEXT: mov.b32 %r4, %r1;
+; SM90-NEXT: @%p1 bra $L__BB65_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw fmaximum ptr addrspace(1) %addr, float %val syncscope("block") acq_rel
+ ret float %retval
+}
+
+define double @fadd_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM90-LABEL: fadd_acq_rel_double_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b64 %rd<4>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [fadd_acq_rel_double_global_cta_param_0];
+; SM90-NEXT: ld.param.b64 %rd2, [fadd_acq_rel_double_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.add.f64 %rd3, [%rd1], %rd2;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
+; SM90-NEXT: ret;
+ %retval = atomicrmw fadd ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define double @fsub_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM90-LABEL: fsub_acq_rel_double_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b64 %rd<6>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd3, [fsub_acq_rel_double_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fsub_acq_rel_double_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b64 %rd5, [%rd2];
+; SM90-NEXT: $L__BB67_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: sub.rn.f64 %rd4, %rd5, %rd3;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
+; SM90-NEXT: setp.ne.b64 %p1, %rd1, %rd5;
+; SM90-NEXT: mov.b64 %rd5, %rd1;
+; SM90-NEXT: @%p1 bra $L__BB67_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw fsub ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define double @fmin_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM90-LABEL: fmin_acq_rel_double_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b64 %rd<6>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd3, [fmin_acq_rel_double_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fmin_acq_rel_double_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b64 %rd5, [%rd2];
+; SM90-NEXT: $L__BB68_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: min.f64 %rd4, %rd5, %rd3;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
+; SM90-NEXT: setp.ne.b64 %p1, %rd1, %rd5;
+; SM90-NEXT: mov.b64 %rd5, %rd1;
+; SM90-NEXT: @%p1 bra $L__BB68_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw fmin ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define double @fmax_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM90-LABEL: fmax_acq_rel_double_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b64 %rd<6>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd3, [fmax_acq_rel_double_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fmax_acq_rel_double_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b64 %rd5, [%rd2];
+; SM90-NEXT: $L__BB69_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: max.f64 %rd4, %rd5, %rd3;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd5, %rd4;
+; SM90-NEXT: setp.ne.b64 %p1, %rd1, %rd5;
+; SM90-NEXT: mov.b64 %rd5, %rd1;
+; SM90-NEXT: @%p1 bra $L__BB69_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw fmax ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define double @fminimum_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM90-LABEL: fminimum_acq_rel_double_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<6>;
+; SM90-NEXT: .reg .b64 %rd<10>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd3, [fminimum_acq_rel_double_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fminimum_acq_rel_double_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b64 %rd9, [%rd2];
+; SM90-NEXT: setp.eq.b64 %p3, %rd3, -9223372036854775808;
+; SM90-NEXT: $L__BB70_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: setp.nan.f64 %p1, %rd9, %rd3;
+; SM90-NEXT: min.f64 %rd4, %rd9, %rd3;
+; SM90-NEXT: selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
+; SM90-NEXT: setp.eq.b64 %p2, %rd9, -9223372036854775808;
+; SM90-NEXT: selp.f64 %rd6, %rd9, %rd5, %p2;
+; SM90-NEXT: selp.f64 %rd7, %rd3, %rd6, %p3;
+; SM90-NEXT: setp.eq.f64 %p4, %rd5, 0d0000000000000000;
+; SM90-NEXT: selp.f64 %rd8, %rd7, %rd5, %p4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd9, %rd8;
+; SM90-NEXT: setp.ne.b64 %p5, %rd1, %rd9;
+; SM90-NEXT: mov.b64 %rd9, %rd1;
+; SM90-NEXT: @%p5 bra $L__BB70_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM90-NEXT: ret;
+ ; No native PTX atomic for fminimum: expanded to a relaxed b64 CAS loop whose selects propagate NaN (0d7FF8000000000000) and pick -0.0 (bit pattern -9223372036854775808) over +0.0; acq_rel ordering comes from the surrounding fence.release.cta / fence.acquire.cta pair.
+ %retval = atomicrmw fminimum ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define double @fmaximum_acq_rel_double_global_cta(ptr addrspace(1) %addr, double %val) {
+; SM90-LABEL: fmaximum_acq_rel_double_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<6>;
+; SM90-NEXT: .reg .b64 %rd<10>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd3, [fmaximum_acq_rel_double_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fmaximum_acq_rel_double_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b64 %rd9, [%rd2];
+; SM90-NEXT: setp.eq.b64 %p3, %rd3, 0;
+; SM90-NEXT: $L__BB71_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: setp.nan.f64 %p1, %rd9, %rd3;
+; SM90-NEXT: max.f64 %rd4, %rd9, %rd3;
+; SM90-NEXT: selp.f64 %rd5, 0d7FF8000000000000, %rd4, %p1;
+; SM90-NEXT: setp.eq.b64 %p2, %rd9, 0;
+; SM90-NEXT: selp.f64 %rd6, %rd9, %rd5, %p2;
+; SM90-NEXT: selp.f64 %rd7, %rd3, %rd6, %p3;
+; SM90-NEXT: setp.eq.f64 %p4, %rd5, 0d0000000000000000;
+; SM90-NEXT: selp.f64 %rd8, %rd7, %rd5, %p4;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b64 %rd1, [%rd2], %rd9, %rd8;
+; SM90-NEXT: setp.ne.b64 %p5, %rd1, %rd9;
+; SM90-NEXT: mov.b64 %rd9, %rd1;
+; SM90-NEXT: @%p5 bra $L__BB71_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b64 [func_retval0], %rd1;
+; SM90-NEXT: ret;
+ ; Mirror of the fminimum case: relaxed b64 CAS loop with NaN propagation and +0.0 (bit pattern 0) preferred over -0.0; fence.release.cta / fence.acquire.cta provide acq_rel at CTA ("block") scope.
+ %retval = atomicrmw fmaximum ptr addrspace(1) %addr, double %val syncscope("block") acq_rel
+ ret double %retval
+}
+
+define half @fadd_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM90-LABEL: fadd_acq_rel_half_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b16 %rs<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [fadd_acq_rel_half_global_cta_param_0];
+; SM90-NEXT: ld.param.b16 %rs1, [fadd_acq_rel_half_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.add.noftz.f16 %rs2, [%rd1], %rs1;
+; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-NEXT: ret;
+ ; f16 fadd lowers directly to the native atom.acq_rel.cta.global.add.noftz.f16 instruction - no CAS expansion or explicit fences needed.
+ %retval = atomicrmw fadd ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define half @fsub_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM90-LABEL: fsub_acq_rel_half_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [fsub_acq_rel_half_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fsub_acq_rel_half_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB73_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: sub.rn.f16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB73_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b16 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ ; No native f16 atomic sub: the 16-bit value is masked/shifted inside its aligned 32-bit word (addr & -4) and updated via a relaxed b32 CAS loop; fence.release.cta before and fence.acquire.cta after give acq_rel semantics.
+ %retval = atomicrmw fsub ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define half @fmin_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM90-LABEL: fmin_acq_rel_half_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [fmin_acq_rel_half_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fmin_acq_rel_half_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB74_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: min.f16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB74_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b16 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ ; No native f16 atomic min: same aligned-word CAS-loop expansion as fsub above, with min.f16 as the update op and release/acquire CTA fences for acq_rel.
+ %retval = atomicrmw fmin ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define half @fmax_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM90-LABEL: fmax_acq_rel_half_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [fmax_acq_rel_half_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fmax_acq_rel_half_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB75_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: max.f16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB75_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b16 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ ; No native f16 atomic max: aligned-word CAS-loop expansion with max.f16 as the update op; release/acquire CTA fences for acq_rel.
+ %retval = atomicrmw fmax ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define half @fminimum_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM90-LABEL: fminimum_acq_rel_half_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [fminimum_acq_rel_half_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fminimum_acq_rel_half_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB76_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: min.NaN.f16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB76_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b16 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ ; fminimum on f16 uses the NaN-propagating min.NaN.f16 instruction inside the aligned-word CAS loop (unlike f64, no explicit select sequence is needed); release/acquire CTA fences for acq_rel.
+ %retval = atomicrmw fminimum ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define half @fmaximum_acq_rel_half_global_cta(ptr addrspace(1) %addr, half %val) {
+; SM90-LABEL: fmaximum_acq_rel_half_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [fmaximum_acq_rel_half_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fmaximum_acq_rel_half_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB77_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: max.NaN.f16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB77_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b16 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ ; fmaximum on f16 uses NaN-propagating max.NaN.f16 inside the aligned-word CAS loop; release/acquire CTA fences for acq_rel.
+ %retval = atomicrmw fmaximum ptr addrspace(1) %addr, half %val syncscope("block") acq_rel
+ ret half %retval
+}
+
+define bfloat @fadd_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM90-LABEL: fadd_acq_rel_bfloat_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b16 %rs<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [fadd_acq_rel_bfloat_global_cta_param_0];
+; SM90-NEXT: ld.param.b16 %rs1, [fadd_acq_rel_bfloat_global_cta_param_1];
+; SM90-NEXT: atom.acq_rel.cta.global.add.noftz.bf16 %rs2, [%rd1], %rs1;
+; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
+; SM90-NEXT: ret;
+ ; bf16 fadd lowers directly to the native atom.acq_rel.cta.global.add.noftz.bf16 instruction - no CAS expansion or explicit fences needed.
+ %retval = atomicrmw fadd ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define bfloat @fsub_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM90-LABEL: fsub_acq_rel_bfloat_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [fsub_acq_rel_bfloat_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fsub_acq_rel_bfloat_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB79_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: sub.rn.bf16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB79_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b16 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ ; No native bf16 atomic sub: aligned 32-bit word CAS-loop expansion (mask 65535 shifted to the halfword's bit offset) with sub.rn.bf16 as the update op; release/acquire CTA fences for acq_rel.
+ %retval = atomicrmw fsub ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define bfloat @fmin_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM90-LABEL: fmin_acq_rel_bfloat_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [fmin_acq_rel_bfloat_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fmin_acq_rel_bfloat_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB80_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: min.bf16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB80_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b16 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ ; No native bf16 atomic min: aligned-word CAS-loop expansion with min.bf16 as the update op; release/acquire CTA fences for acq_rel.
+ %retval = atomicrmw fmin ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define bfloat @fmax_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM90-LABEL: fmax_acq_rel_bfloat_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [fmax_acq_rel_bfloat_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fmax_acq_rel_bfloat_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB81_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: max.bf16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB81_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b16 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ ; No native bf16 atomic max: aligned-word CAS-loop expansion with max.bf16 as the update op; release/acquire CTA fences for acq_rel.
+ %retval = atomicrmw fmax ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define bfloat @fminimum_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM90-LABEL: fminimum_acq_rel_bfloat_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [fminimum_acq_rel_bfloat_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fminimum_acq_rel_bfloat_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB82_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: min.NaN.bf16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB82_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b16 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ ; fminimum on bf16 uses NaN-propagating min.NaN.bf16 inside the aligned-word CAS loop; release/acquire CTA fences for acq_rel.
+ %retval = atomicrmw fminimum ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define bfloat @fmaximum_acq_rel_bfloat_global_cta(ptr addrspace(1) %addr, bfloat %val) {
+; SM90-LABEL: fmaximum_acq_rel_bfloat_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b16 %rs<4>;
+; SM90-NEXT: .reg .b32 %r<15>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b16 %rs1, [fmaximum_acq_rel_bfloat_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd2, [fmaximum_acq_rel_bfloat_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r4, %rd2;
+; SM90-NEXT: and.b32 %r5, %r4, 3;
+; SM90-NEXT: shl.b32 %r1, %r5, 3;
+; SM90-NEXT: mov.b32 %r6, 65535;
+; SM90-NEXT: shl.b32 %r7, %r6, %r1;
+; SM90-NEXT: not.b32 %r2, %r7;
+; SM90-NEXT: ld.global.b32 %r14, [%rd1];
+; SM90-NEXT: $L__BB83_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: shr.u32 %r8, %r14, %r1;
+; SM90-NEXT: cvt.u16.u32 %rs2, %r8;
+; SM90-NEXT: max.NaN.bf16 %rs3, %rs2, %rs1;
+; SM90-NEXT: cvt.u32.u16 %r9, %rs3;
+; SM90-NEXT: shl.b32 %r10, %r9, %r1;
+; SM90-NEXT: and.b32 %r11, %r14, %r2;
+; SM90-NEXT: or.b32 %r12, %r11, %r10;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r3, [%rd1], %r14, %r12;
+; SM90-NEXT: setp.ne.b32 %p1, %r3, %r14;
+; SM90-NEXT: mov.b32 %r14, %r3;
+; SM90-NEXT: @%p1 bra $L__BB83_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r13, %r3, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b16 [func_retval0], %r13;
+; SM90-NEXT: ret;
+ ; fmaximum on bf16 uses NaN-propagating max.NaN.bf16 inside the aligned-word CAS loop; release/acquire CTA fences for acq_rel.
+ %retval = atomicrmw fmaximum ptr addrspace(1) %addr, bfloat %val syncscope("block") acq_rel
+ ret bfloat %retval
+}
+
+define i8 @add_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: add_monotonic_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [add_monotonic_i8_global_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.b8 %r6, [add_monotonic_i8_global_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 255;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: $L__BB84_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: add.s32 %r10, %r15, %r4;
+; SM90-NEXT: and.b32 %r11, %r10, %r2;
+; SM90-NEXT: and.b32 %r12, %r15, %r3;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM90-NEXT: mov.b32 %r15, %r5;
+; SM90-NEXT: @%p1 bra $L__BB84_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r5, %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ ; i8 add is emulated with a relaxed b32 CAS loop on the aligned containing word (mask 255 shifted to the byte's bit offset); monotonic ordering needs no fences.
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") monotonic
+ ret i8 %retval
+}
+
+define i8 @add_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: add_acquire_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [add_acquire_i8_global_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.b8 %r6, [add_acquire_i8_global_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 255;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: $L__BB85_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: add.s32 %r10, %r15, %r4;
+; SM90-NEXT: and.b32 %r11, %r10, %r2;
+; SM90-NEXT: and.b32 %r12, %r15, %r3;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM90-NEXT: mov.b32 %r15, %r5;
+; SM90-NEXT: @%p1 bra $L__BB85_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r5, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ ; Same i8 aligned-word CAS-loop emulation as the monotonic case, with a trailing fence.acquire.cta (and no leading fence) providing acquire ordering.
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") acquire
+ ret i8 %retval
+}
+
+define i8 @add_release_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: add_release_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [add_release_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r6, [add_release_i8_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 255;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: $L__BB86_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: add.s32 %r10, %r15, %r4;
+; SM90-NEXT: and.b32 %r11, %r10, %r2;
+; SM90-NEXT: and.b32 %r12, %r15, %r3;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM90-NEXT: mov.b32 %r15, %r5;
+; SM90-NEXT: @%p1 bra $L__BB86_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r5, %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ ; Same i8 aligned-word CAS-loop emulation, with a leading fence.release.cta (and no trailing fence) providing release ordering.
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") release
+ ret i8 %retval
+}
+
+define i8 @add_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: add_seq_cst_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<16>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [add_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.b8 %r6, [add_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 255;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r15, [%rd1];
+; SM90-NEXT: $L__BB87_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: add.s32 %r10, %r15, %r4;
+; SM90-NEXT: and.b32 %r11, %r10, %r2;
+; SM90-NEXT: and.b32 %r12, %r15, %r3;
+; SM90-NEXT: or.b32 %r13, %r12, %r11;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r13;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r15;
+; SM90-NEXT: mov.b32 %r15, %r5;
+; SM90-NEXT: @%p1 bra $L__BB87_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r14, %r5, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r14;
+; SM90-NEXT: ret;
+ ; Same i8 aligned-word CAS-loop emulation; seq_cst is realized as a leading fence.sc.cta plus a trailing fence.acquire.cta around the relaxed loop.
+ %retval = atomicrmw add ptr addrspace(1) %addr, i8 %val syncscope("block") seq_cst
+ ret i8 %retval
+}
+
+define i32 @add_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: add_monotonic_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [add_monotonic_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [add_monotonic_i32_global_cta_param_1];
+; SM90-NEXT: atom.relaxed.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ ; i32 add is a single native instruction; monotonic maps to the .relaxed qualifier with no fences.
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") monotonic
+ ret i32 %retval
+}
+
+define i32 @add_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: add_acquire_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [add_acquire_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [add_acquire_i32_global_cta_param_1];
+; SM90-NEXT: atom.acquire.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ ; acquire maps directly onto the instruction's .acquire ordering qualifier - no separate fence.
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") acquire
+ ret i32 %retval
+}
+
+define i32 @add_release_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: add_release_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [add_release_i32_global_cta_param_0];
+; SM90-NEXT: ld.param.b32 %r1, [add_release_i32_global_cta_param_1];
+; SM90-NEXT: atom.release.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ ; release maps directly onto the instruction's .release ordering qualifier - no separate fence.
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") release
+ ret i32 %retval
+}
+
+define i32 @add_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: add_seq_cst_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .b32 %r<3>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd1, [add_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.b32 %r1, [add_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT: atom.acquire.cta.global.add.u32 %r2, [%rd1], %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r2;
+; SM90-NEXT: ret;
+ ; seq_cst is realized as fence.sc.cta followed by the native atom with .acquire ordering (no trailing fence needed for this lowering).
+ %retval = atomicrmw add ptr addrspace(1) %addr, i32 %val syncscope("block") seq_cst
+ ret i32 %retval
+}
+
+define i8 @nand_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: nand_monotonic_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<17>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [nand_monotonic_i8_global_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.b8 %r6, [nand_monotonic_i8_global_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 255;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
+; SM90-NEXT: $L__BB92_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r10, %r16, %r4;
+; SM90-NEXT: not.b32 %r11, %r10;
+; SM90-NEXT: and.b32 %r12, %r11, %r2;
+; SM90-NEXT: and.b32 %r13, %r16, %r3;
+; SM90-NEXT: or.b32 %r14, %r13, %r12;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM90-NEXT: mov.b32 %r16, %r5;
+; SM90-NEXT: @%p1 bra $L__BB92_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r15, %r5, %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r15;
+; SM90-NEXT: ret;
+ ; nand has no native PTX atomic; i8 nand is emulated with not(old & val) computed inside the aligned-word relaxed CAS loop. monotonic ordering needs no fences.
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") monotonic
+ ret i8 %retval
+}
+
+define i8 @nand_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: nand_acquire_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<17>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [nand_acquire_i8_global_cta_param_0];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: ld.param.b8 %r6, [nand_acquire_i8_global_cta_param_1];
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 255;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
+; SM90-NEXT: $L__BB93_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r10, %r16, %r4;
+; SM90-NEXT: not.b32 %r11, %r10;
+; SM90-NEXT: and.b32 %r12, %r11, %r2;
+; SM90-NEXT: and.b32 %r13, %r16, %r3;
+; SM90-NEXT: or.b32 %r14, %r13, %r12;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM90-NEXT: mov.b32 %r16, %r5;
+; SM90-NEXT: @%p1 bra $L__BB93_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r15, %r5, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r15;
+; SM90-NEXT: ret;
+ ; Same i8 nand CAS-loop emulation as the monotonic case, with a trailing fence.acquire.cta providing acquire ordering.
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") acquire
+ ret i8 %retval
+}
+
+define i8 @nand_release_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: nand_release_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<17>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [nand_release_i8_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.param.b8 %r6, [nand_release_i8_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 255;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
+; SM90-NEXT: $L__BB94_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r10, %r16, %r4;
+; SM90-NEXT: not.b32 %r11, %r10;
+; SM90-NEXT: and.b32 %r12, %r11, %r2;
+; SM90-NEXT: and.b32 %r13, %r16, %r3;
+; SM90-NEXT: or.b32 %r14, %r13, %r12;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM90-NEXT: mov.b32 %r16, %r5;
+; SM90-NEXT: @%p1 bra $L__BB94_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r15, %r5, %r1;
+; SM90-NEXT: st.param.b32 [func_retval0], %r15;
+; SM90-NEXT: ret;
+ ; Same i8 nand CAS-loop emulation, with a leading fence.release.cta providing release ordering.
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") release
+ ret i8 %retval
+}
+
+define i8 @nand_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
+; SM90-LABEL: nand_seq_cst_i8_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<17>;
+; SM90-NEXT: .reg .b64 %rd<3>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b64 %rd2, [nand_seq_cst_i8_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.param.b8 %r6, [nand_seq_cst_i8_global_cta_param_1];
+; SM90-NEXT: and.b64 %rd1, %rd2, -4;
+; SM90-NEXT: cvt.u32.u64 %r7, %rd2;
+; SM90-NEXT: and.b32 %r8, %r7, 3;
+; SM90-NEXT: shl.b32 %r1, %r8, 3;
+; SM90-NEXT: mov.b32 %r9, 255;
+; SM90-NEXT: shl.b32 %r2, %r9, %r1;
+; SM90-NEXT: not.b32 %r3, %r2;
+; SM90-NEXT: shl.b32 %r4, %r6, %r1;
+; SM90-NEXT: ld.global.b32 %r16, [%rd1];
+; SM90-NEXT: $L__BB95_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r10, %r16, %r4;
+; SM90-NEXT: not.b32 %r11, %r10;
+; SM90-NEXT: and.b32 %r12, %r11, %r2;
+; SM90-NEXT: and.b32 %r13, %r16, %r3;
+; SM90-NEXT: or.b32 %r14, %r13, %r12;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r14;
+; SM90-NEXT: setp.ne.b32 %p1, %r5, %r16;
+; SM90-NEXT: mov.b32 %r16, %r5;
+; SM90-NEXT: @%p1 bra $L__BB95_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: shr.u32 %r15, %r5, %r1;
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r15;
+; SM90-NEXT: ret;
+ ; Same i8 nand CAS-loop emulation; seq_cst is realized as a leading fence.sc.cta plus a trailing fence.acquire.cta around the relaxed loop.
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i8 %val syncscope("block") seq_cst
+ ret i8 %retval
+}
+
+define i32 @nand_monotonic_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: nand_monotonic_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<6>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [nand_monotonic_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [nand_monotonic_i32_global_cta_param_0];
+; SM90-NEXT: ld.global.b32 %r5, [%rd1];
+; SM90-NEXT: $L__BB96_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r3, %r5, %r2;
+; SM90-NEXT: not.b32 %r4, %r3;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM90-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM90-NEXT: mov.b32 %r5, %r1;
+; SM90-NEXT: @%p1 bra $L__BB96_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") monotonic
+ ret i32 %retval
+}
+
+define i32 @nand_acquire_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: nand_acquire_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<6>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [nand_acquire_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [nand_acquire_i32_global_cta_param_0];
+; SM90-NEXT: ld.global.b32 %r5, [%rd1];
+; SM90-NEXT: $L__BB97_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r3, %r5, %r2;
+; SM90-NEXT: not.b32 %r4, %r3;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM90-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM90-NEXT: mov.b32 %r5, %r1;
+; SM90-NEXT: @%p1 bra $L__BB97_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") acquire
+ ret i32 %retval
+}
+
+define i32 @nand_release_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: nand_release_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<6>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [nand_release_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [nand_release_i32_global_cta_param_0];
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ld.global.b32 %r5, [%rd1];
+; SM90-NEXT: $L__BB98_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r3, %r5, %r2;
+; SM90-NEXT: not.b32 %r4, %r3;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM90-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM90-NEXT: mov.b32 %r5, %r1;
+; SM90-NEXT: @%p1 bra $L__BB98_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") release
+ ret i32 %retval
+}
+
+define i32 @nand_seq_cst_i32_global_cta(ptr addrspace(1) %addr, i32 %val) {
+; SM90-LABEL: nand_seq_cst_i32_global_cta(
+; SM90: {
+; SM90-NEXT: .reg .pred %p<2>;
+; SM90-NEXT: .reg .b32 %r<6>;
+; SM90-NEXT: .reg .b64 %rd<2>;
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: ld.param.b32 %r2, [nand_seq_cst_i32_global_cta_param_1];
+; SM90-NEXT: ld.param.b64 %rd1, [nand_seq_cst_i32_global_cta_param_0];
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ld.global.b32 %r5, [%rd1];
+; SM90-NEXT: $L__BB99_1: // %atomicrmw.start
+; SM90-NEXT: // =>This Inner Loop Header: Depth=1
+; SM90-NEXT: and.b32 %r3, %r5, %r2;
+; SM90-NEXT: not.b32 %r4, %r3;
+; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r1, [%rd1], %r5, %r4;
+; SM90-NEXT: setp.ne.b32 %p1, %r1, %r5;
+; SM90-NEXT: mov.b32 %r5, %r1;
+; SM90-NEXT: @%p1 bra $L__BB99_1;
+; SM90-NEXT: // %bb.2: // %atomicrmw.end
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: st.param.b32 [func_retval0], %r1;
+; SM90-NEXT: ret;
+ %retval = atomicrmw nand ptr addrspace(1) %addr, i32 %val syncscope("block") seq_cst
+ ret i32 %retval
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw.py b/llvm/test/CodeGen/NVPTX/atomicrmw.py
new file mode 100644
index 0000000000000..6417891c56c57
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/atomicrmw.py
@@ -0,0 +1,120 @@
+# For manual usage, not as a part of lit tests. Used for generating the atomicrmw-sm*.ll tests in this directory.
+
+from string import Template
+from itertools import product
+
+TESTS = [(60, 50), (70, 63), (90, 87)]
+
+LLVM_SCOPES = ["", "block", "cluster", "device"]
+
+SCOPE_LLVM_TO_PTX = {"": "sys", "block": "cta", "cluster": "cluster", "device": "gpu"}
+
+ORDERINGS = ["monotonic", "acquire", "release", "acq_rel", "seq_cst"]
+
+INTEGER_OPERATIONS = [
+ "xchg",
+ "add",
+ "sub",
+ "and",
+ "nand",
+ "or",
+ "xor",
+ "max",
+ "min",
+ "umax",
+ "umin",
+ "uinc_wrap",
+ "udec_wrap",
+ "usub_cond",
+ "usub_sat",
+]
+
+FLOATING_POINT_OPERATIONS = ["fadd", "fsub", "fmin", "fmax", "fminimum", "fmaximum"]
+
+ADDRSPACES = [0, 1, 3]
+
+ADDRSPACE_NUM_TO_ADDRSPACE = {0: "generic", 1: "global", 3: "shared"}
+
+atomicrmw_func = Template(
+ """define ${datatype} @${operation}_${ordering}_${datatype}_${addrspace}_${ptx_scope}(ptr${addrspace_cast} %addr, ${datatype} %val) {
+ %retval = atomicrmw ${operation} ptr ${addrspace_cast} %addr, ${datatype} %val syncscope(\"${llvm_scope}\") ${ordering}
+ ret $datatype %retval
+}
+"""
+)
+
+run_statement = Template(
+ """; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify -arch=sm_${sm} %}
+"""
+)
+
+
+def get_addrspace_cast(addrspace):
+ if addrspace == 0:
+ return ""
+ else:
+ return " addrspace({})".format(str(addrspace))
+
+
+if __name__ == "__main__":
+ for sm, ptx in TESTS:
+ # Slice 1: Keep addrspace, llvm_scope, ordering fixed, generate all possible operations and sizes
+ with open("atomicrmw-sm{}.ll".format(str(sm)), "w") as fp:
+ print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
+ # Integer operations
+ addrspace, llvm_scope, ordering = 1, "block", "acq_rel"
+ for operation, datatype in product(
+ INTEGER_OPERATIONS, ["i8", "i16", "i32", "i64"]
+ ):
+ print(
+ atomicrmw_func.substitute(
+ operation=operation,
+ ordering=ordering,
+ datatype=datatype,
+ addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
+ ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
+ llvm_scope=llvm_scope,
+ addrspace_cast=get_addrspace_cast(addrspace),
+ ),
+ file=fp,
+ )
+
+ # Floating point add
+ for datatype, operation in product(
+ ["float", "double", "half", "bfloat"], FLOATING_POINT_OPERATIONS
+ ):
+ print(
+ atomicrmw_func.substitute(
+ operation=operation,
+ ordering=ordering,
+ datatype=datatype,
+ addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
+ ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
+ llvm_scope=llvm_scope,
+ addrspace_cast=get_addrspace_cast(addrspace),
+ ),
+ file=fp,
+ )
+
+ # Slice 2: Keep addrspace, llvm_scope fixed, and generate all possible orderings for operations add and nand.
+ # add is natively supported for larger bitwidths, while nand is emulated always
+ addrspace, llvm_scope = 1, "block"
+ for operation, datatype, ordering in product(
+ ["add", "nand"], ["i8", "i32"], ORDERINGS
+ ):
+ if addrspace == 1 and llvm_scope == "block" and ordering == "acq_rel":
+ # These are a part of Slice 1
+ continue
+ print(
+ atomicrmw_func.substitute(
+ operation=operation,
+ ordering=ordering,
+ datatype=datatype,
+ addrspace=ADDRSPACE_NUM_TO_ADDRSPACE[addrspace],
+ addrspace_cast=get_addrspace_cast(addrspace),
+ ptx_scope=SCOPE_LLVM_TO_PTX[llvm_scope],
+ llvm_scope=llvm_scope,
+ ),
+ file=fp,
+ )
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll
index ae10526ec8365..10e1ca434f271 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm60.ll
@@ -5,26 +5,15 @@
; CHECK-LABEL: .func test(
define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, double %d) {
-; CHECK: atom.add.f64
+; CHECK: atom.sys.add.f64
%r1 = call double @llvm.nvvm.atomic.load.add.f64.p0(ptr %dp0, double %d)
-; CHECK: atom.global.add.f64
+; CHECK: atom.sys.global.add.f64
%r2 = call double @llvm.nvvm.atomic.load.add.f64.p1(ptr addrspace(1) %dp1, double %d)
-; CHECK: atom.shared.add.f64
+; CHECK: atom.sys.shared.add.f64
%ret = call double @llvm.nvvm.atomic.load.add.f64.p3(ptr addrspace(3) %dp3, double %d)
ret void
}
-; CHECK-LABEL: .func test2(
-define void @test2(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, double %d) {
-; CHECK: atom.add.f64
- %r1 = atomicrmw fadd ptr %dp0, double %d seq_cst
-; CHECK: atom.global.add.f64
- %r2 = atomicrmw fadd ptr addrspace(1) %dp1, double %d seq_cst
-; CHECK: atom.shared.add.f64
- %ret = atomicrmw fadd ptr addrspace(3) %dp3, double %d seq_cst
- ret void
-}
-
declare double @llvm.nvvm.atomic.load.add.f64.p0(ptr nocapture, double) #1
declare double @llvm.nvvm.atomic.load.add.f64.p1(ptr addrspace(1) nocapture, double) #1
declare double @llvm.nvvm.atomic.load.add.f64.p3(ptr addrspace(3) nocapture, double) #1
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
deleted file mode 100644
index e2762bac45a35..0000000000000
--- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
+++ /dev/null
@@ -1,144 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefixes=CHECK64
-; RUN: llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | FileCheck %s --check-prefixes=CHECKPTX62
-; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
-; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
-; RUN: %if ptxas-sm_70 && ptxas-isa-6.2 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_70 -mattr=+ptx62 | %ptxas-verify -arch=sm_70 %}
-
-target triple = "nvptx64-nvidia-cuda"
-
-define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %val) {
-; CHECK-LABEL: test(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<7>;
-; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_param_0];
-; CHECK-NEXT: ld.param.b16 %rs1, [test_param_3];
-; CHECK-NEXT: atom.add.noftz.f16 %rs2, [%r1], %rs1;
-; CHECK-NEXT: ld.param.b32 %r2, [test_param_1];
-; CHECK-NEXT: mov.b16 %rs3, 0x3C00;
-; CHECK-NEXT: atom.add.noftz.f16 %rs4, [%r1], %rs3;
-; CHECK-NEXT: ld.param.b32 %r3, [test_param_2];
-; CHECK-NEXT: atom.global.add.noftz.f16 %rs5, [%r2], %rs1;
-; CHECK-NEXT: atom.shared.add.noftz.f16 %rs6, [%r3], %rs1;
-; CHECK-NEXT: ret;
-;
-; CHECK64-LABEL: test(
-; CHECK64: {
-; CHECK64-NEXT: .reg .b16 %rs<7>;
-; CHECK64-NEXT: .reg .b64 %rd<4>;
-; CHECK64-EMPTY:
-; CHECK64-NEXT: // %bb.0:
-; CHECK64-NEXT: ld.param.b64 %rd1, [test_param_0];
-; CHECK64-NEXT: ld.param.b16 %rs1, [test_param_3];
-; CHECK64-NEXT: atom.add.noftz.f16 %rs2, [%rd1], %rs1;
-; CHECK64-NEXT: ld.param.b64 %rd2, [test_param_1];
-; CHECK64-NEXT: mov.b16 %rs3, 0x3C00;
-; CHECK64-NEXT: atom.add.noftz.f16 %rs4, [%rd1], %rs3;
-; CHECK64-NEXT: ld.param.b64 %rd3, [test_param_2];
-; CHECK64-NEXT: atom.global.add.noftz.f16 %rs5, [%rd2], %rs1;
-; CHECK64-NEXT: atom.shared.add.noftz.f16 %rs6, [%rd3], %rs1;
-; CHECK64-NEXT: ret;
-;
-; CHECKPTX62-LABEL: test(
-; CHECKPTX62: {
-; CHECKPTX62-NEXT: .reg .pred %p<5>;
-; CHECKPTX62-NEXT: .reg .b16 %rs<11>;
-; CHECKPTX62-NEXT: .reg .b32 %r<50>;
-; CHECKPTX62-EMPTY:
-; CHECKPTX62-NEXT: // %bb.0:
-; CHECKPTX62-NEXT: ld.param.b16 %rs1, [test_param_3];
-; CHECKPTX62-NEXT: ld.param.b32 %r15, [test_param_2];
-; CHECKPTX62-NEXT: ld.param.b32 %r14, [test_param_1];
-; CHECKPTX62-NEXT: ld.param.b32 %r16, [test_param_0];
-; CHECKPTX62-NEXT: and.b32 %r1, %r16, -4;
-; CHECKPTX62-NEXT: and.b32 %r17, %r16, 3;
-; CHECKPTX62-NEXT: shl.b32 %r2, %r17, 3;
-; CHECKPTX62-NEXT: mov.b32 %r18, 65535;
-; CHECKPTX62-NEXT: shl.b32 %r19, %r18, %r2;
-; CHECKPTX62-NEXT: not.b32 %r3, %r19;
-; CHECKPTX62-NEXT: ld.b32 %r46, [%r1];
-; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45
-; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT: shr.u32 %r20, %r46, %r2;
-; CHECKPTX62-NEXT: cvt.u16.u32 %rs2, %r20;
-; CHECKPTX62-NEXT: add.rn.f16 %rs3, %rs2, %rs1;
-; CHECKPTX62-NEXT: cvt.u32.u16 %r21, %rs3;
-; CHECKPTX62-NEXT: shl.b32 %r22, %r21, %r2;
-; CHECKPTX62-NEXT: and.b32 %r23, %r46, %r3;
-; CHECKPTX62-NEXT: or.b32 %r24, %r23, %r22;
-; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24;
-; CHECKPTX62-NEXT: setp.ne.b32 %p1, %r4, %r46;
-; CHECKPTX62-NEXT: mov.b32 %r46, %r4;
-; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1;
-; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44
-; CHECKPTX62-NEXT: ld.b32 %r47, [%r1];
-; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27
-; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT: shr.u32 %r25, %r47, %r2;
-; CHECKPTX62-NEXT: cvt.u16.u32 %rs4, %r25;
-; CHECKPTX62-NEXT: mov.b16 %rs5, 0x3C00;
-; CHECKPTX62-NEXT: add.rn.f16 %rs6, %rs4, %rs5;
-; CHECKPTX62-NEXT: cvt.u32.u16 %r26, %rs6;
-; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2;
-; CHECKPTX62-NEXT: and.b32 %r28, %r47, %r3;
-; CHECKPTX62-NEXT: or.b32 %r29, %r28, %r27;
-; CHECKPTX62-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29;
-; CHECKPTX62-NEXT: setp.ne.b32 %p2, %r5, %r47;
-; CHECKPTX62-NEXT: mov.b32 %r47, %r5;
-; CHECKPTX62-NEXT: @%p2 bra $L__BB0_3;
-; CHECKPTX62-NEXT: // %bb.4: // %atomicrmw.end26
-; CHECKPTX62-NEXT: and.b32 %r6, %r14, -4;
-; CHECKPTX62-NEXT: shl.b32 %r30, %r14, 3;
-; CHECKPTX62-NEXT: and.b32 %r7, %r30, 24;
-; CHECKPTX62-NEXT: mov.b32 %r31, 65535;
-; CHECKPTX62-NEXT: shl.b32 %r32, %r31, %r7;
-; CHECKPTX62-NEXT: not.b32 %r8, %r32;
-; CHECKPTX62-NEXT: ld.global.b32 %r48, [%r6];
-; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9
-; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT: shr.u32 %r33, %r48, %r7;
-; CHECKPTX62-NEXT: cvt.u16.u32 %rs7, %r33;
-; CHECKPTX62-NEXT: add.rn.f16 %rs8, %rs7, %rs1;
-; CHECKPTX62-NEXT: cvt.u32.u16 %r34, %rs8;
-; CHECKPTX62-NEXT: shl.b32 %r35, %r34, %r7;
-; CHECKPTX62-NEXT: and.b32 %r36, %r48, %r8;
-; CHECKPTX62-NEXT: or.b32 %r37, %r36, %r35;
-; CHECKPTX62-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37;
-; CHECKPTX62-NEXT: setp.ne.b32 %p3, %r9, %r48;
-; CHECKPTX62-NEXT: mov.b32 %r48, %r9;
-; CHECKPTX62-NEXT: @%p3 bra $L__BB0_5;
-; CHECKPTX62-NEXT: // %bb.6: // %atomicrmw.end8
-; CHECKPTX62-NEXT: and.b32 %r10, %r15, -4;
-; CHECKPTX62-NEXT: shl.b32 %r38, %r15, 3;
-; CHECKPTX62-NEXT: and.b32 %r11, %r38, 24;
-; CHECKPTX62-NEXT: mov.b32 %r39, 65535;
-; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11;
-; CHECKPTX62-NEXT: not.b32 %r12, %r40;
-; CHECKPTX62-NEXT: ld.shared.b32 %r49, [%r10];
-; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start
-; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX62-NEXT: shr.u32 %r41, %r49, %r11;
-; CHECKPTX62-NEXT: cvt.u16.u32 %rs9, %r41;
-; CHECKPTX62-NEXT: add.rn.f16 %rs10, %rs9, %rs1;
-; CHECKPTX62-NEXT: cvt.u32.u16 %r42, %rs10;
-; CHECKPTX62-NEXT: shl.b32 %r43, %r42, %r11;
-; CHECKPTX62-NEXT: and.b32 %r44, %r49, %r12;
-; CHECKPTX62-NEXT: or.b32 %r45, %r44, %r43;
-; CHECKPTX62-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45;
-; CHECKPTX62-NEXT: setp.ne.b32 %p4, %r13, %r49;
-; CHECKPTX62-NEXT: mov.b32 %r49, %r13;
-; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7;
-; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end
-; CHECKPTX62-NEXT: ret;
- %r1 = atomicrmw fadd ptr %dp0, half %val monotonic
- %r2 = atomicrmw fadd ptr %dp0, half 1.0 monotonic
- %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val monotonic
- %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val monotonic
- ret void
-}
-
-attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
deleted file mode 100644
index e6c6a73eef14d..0000000000000
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ /dev/null
@@ -1,147 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefixes=CHECK64
-; RUN: llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | FileCheck %s --check-prefixes=CHECKPTX71
-; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
-; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
-; RUN: %if ptxas-sm_86 && ptxas-isa-7.1 && ptxas-ptr32 %{ llc < %s -mtriple=nvptx -mcpu=sm_86 -mattr=+ptx71 | %ptxas-verify -arch=sm_86 %}
-
-target triple = "nvptx64-nvidia-cuda"
-
-define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat %val) {
-; CHECK-LABEL: test(
-; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<7>;
-; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b32 %r1, [test_param_0];
-; CHECK-NEXT: ld.param.b16 %rs1, [test_param_3];
-; CHECK-NEXT: atom.add.noftz.bf16 %rs2, [%r1], %rs1;
-; CHECK-NEXT: ld.param.b32 %r2, [test_param_1];
-; CHECK-NEXT: mov.b16 %rs3, 0x3F80;
-; CHECK-NEXT: atom.add.noftz.bf16 %rs4, [%r1], %rs3;
-; CHECK-NEXT: ld.param.b32 %r3, [test_param_2];
-; CHECK-NEXT: atom.global.add.noftz.bf16 %rs5, [%r2], %rs1;
-; CHECK-NEXT: atom.shared.add.noftz.bf16 %rs6, [%r3], %rs1;
-; CHECK-NEXT: ret;
-;
-; CHECK64-LABEL: test(
-; CHECK64: {
-; CHECK64-NEXT: .reg .b16 %rs<7>;
-; CHECK64-NEXT: .reg .b64 %rd<4>;
-; CHECK64-EMPTY:
-; CHECK64-NEXT: // %bb.0:
-; CHECK64-NEXT: ld.param.b64 %rd1, [test_param_0];
-; CHECK64-NEXT: ld.param.b16 %rs1, [test_param_3];
-; CHECK64-NEXT: atom.add.noftz.bf16 %rs2, [%rd1], %rs1;
-; CHECK64-NEXT: ld.param.b64 %rd2, [test_param_1];
-; CHECK64-NEXT: mov.b16 %rs3, 0x3F80;
-; CHECK64-NEXT: atom.add.noftz.bf16 %rs4, [%rd1], %rs3;
-; CHECK64-NEXT: ld.param.b64 %rd3, [test_param_2];
-; CHECK64-NEXT: atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1;
-; CHECK64-NEXT: atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1;
-; CHECK64-NEXT: ret;
-;
-; CHECKPTX71-LABEL: test(
-; CHECKPTX71: {
-; CHECKPTX71-NEXT: .reg .pred %p<5>;
-; CHECKPTX71-NEXT: .reg .b16 %rs<14>;
-; CHECKPTX71-NEXT: .reg .b32 %r<50>;
-; CHECKPTX71-EMPTY:
-; CHECKPTX71-NEXT: // %bb.0:
-; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
-; CHECKPTX71-NEXT: ld.param.b32 %r15, [test_param_2];
-; CHECKPTX71-NEXT: ld.param.b32 %r14, [test_param_1];
-; CHECKPTX71-NEXT: ld.param.b32 %r16, [test_param_0];
-; CHECKPTX71-NEXT: and.b32 %r1, %r16, -4;
-; CHECKPTX71-NEXT: and.b32 %r17, %r16, 3;
-; CHECKPTX71-NEXT: shl.b32 %r2, %r17, 3;
-; CHECKPTX71-NEXT: mov.b32 %r18, 65535;
-; CHECKPTX71-NEXT: shl.b32 %r19, %r18, %r2;
-; CHECKPTX71-NEXT: not.b32 %r3, %r19;
-; CHECKPTX71-NEXT: ld.b32 %r46, [%r1];
-; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45
-; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: shr.u32 %r20, %r46, %r2;
-; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r20;
-; CHECKPTX71-NEXT: mov.b16 %rs3, 0x3F80;
-; CHECKPTX71-NEXT: fma.rn.bf16 %rs4, %rs2, %rs3, %rs1;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r21, %rs4;
-; CHECKPTX71-NEXT: shl.b32 %r22, %r21, %r2;
-; CHECKPTX71-NEXT: and.b32 %r23, %r46, %r3;
-; CHECKPTX71-NEXT: or.b32 %r24, %r23, %r22;
-; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r4, [%r1], %r46, %r24;
-; CHECKPTX71-NEXT: setp.ne.b32 %p1, %r4, %r46;
-; CHECKPTX71-NEXT: mov.b32 %r46, %r4;
-; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
-; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44
-; CHECKPTX71-NEXT: ld.b32 %r47, [%r1];
-; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27
-; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: shr.u32 %r25, %r47, %r2;
-; CHECKPTX71-NEXT: cvt.u16.u32 %rs5, %r25;
-; CHECKPTX71-NEXT: mov.b16 %rs6, 0x3F80;
-; CHECKPTX71-NEXT: fma.rn.bf16 %rs7, %rs5, %rs6, %rs6;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r26, %rs7;
-; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
-; CHECKPTX71-NEXT: and.b32 %r28, %r47, %r3;
-; CHECKPTX71-NEXT: or.b32 %r29, %r28, %r27;
-; CHECKPTX71-NEXT: atom.relaxed.sys.cas.b32 %r5, [%r1], %r47, %r29;
-; CHECKPTX71-NEXT: setp.ne.b32 %p2, %r5, %r47;
-; CHECKPTX71-NEXT: mov.b32 %r47, %r5;
-; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
-; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26
-; CHECKPTX71-NEXT: and.b32 %r6, %r14, -4;
-; CHECKPTX71-NEXT: shl.b32 %r30, %r14, 3;
-; CHECKPTX71-NEXT: and.b32 %r7, %r30, 24;
-; CHECKPTX71-NEXT: mov.b32 %r31, 65535;
-; CHECKPTX71-NEXT: shl.b32 %r32, %r31, %r7;
-; CHECKPTX71-NEXT: not.b32 %r8, %r32;
-; CHECKPTX71-NEXT: ld.global.b32 %r48, [%r6];
-; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9
-; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: shr.u32 %r33, %r48, %r7;
-; CHECKPTX71-NEXT: cvt.u16.u32 %rs8, %r33;
-; CHECKPTX71-NEXT: mov.b16 %rs9, 0x3F80;
-; CHECKPTX71-NEXT: fma.rn.bf16 %rs10, %rs8, %rs9, %rs1;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs10;
-; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r7;
-; CHECKPTX71-NEXT: and.b32 %r36, %r48, %r8;
-; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
-; CHECKPTX71-NEXT: atom.relaxed.sys.global.cas.b32 %r9, [%r6], %r48, %r37;
-; CHECKPTX71-NEXT: setp.ne.b32 %p3, %r9, %r48;
-; CHECKPTX71-NEXT: mov.b32 %r48, %r9;
-; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
-; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8
-; CHECKPTX71-NEXT: and.b32 %r10, %r15, -4;
-; CHECKPTX71-NEXT: shl.b32 %r38, %r15, 3;
-; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
-; CHECKPTX71-NEXT: mov.b32 %r39, 65535;
-; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11;
-; CHECKPTX71-NEXT: not.b32 %r12, %r40;
-; CHECKPTX71-NEXT: ld.shared.b32 %r49, [%r10];
-; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
-; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: shr.u32 %r41, %r49, %r11;
-; CHECKPTX71-NEXT: cvt.u16.u32 %rs11, %r41;
-; CHECKPTX71-NEXT: mov.b16 %rs12, 0x3F80;
-; CHECKPTX71-NEXT: fma.rn.bf16 %rs13, %rs11, %rs12, %rs1;
-; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs13;
-; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
-; CHECKPTX71-NEXT: and.b32 %r44, %r49, %r12;
-; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
-; CHECKPTX71-NEXT: atom.relaxed.sys.shared.cas.b32 %r13, [%r10], %r49, %r45;
-; CHECKPTX71-NEXT: setp.ne.b32 %p4, %r13, %r49;
-; CHECKPTX71-NEXT: mov.b32 %r49, %r13;
-; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
-; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
-; CHECKPTX71-NEXT: ret;
- %r1 = atomicrmw fadd ptr %dp0, bfloat %val monotonic
- %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 monotonic
- %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val monotonic
- %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val monotonic
- ret void
-}
-
-attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index 6ea02f35e9626..48d38f565c3ef 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -2,349 +2,6 @@
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_32 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_32 | %ptxas-verify %}
-
-; CHECK-LABEL: atom0
-define i32 @atom0(ptr %addr, i32 %val) {
-; CHECK-LABEL: atom0(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom0_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atom0_param_1];
-; CHECK-NEXT: atom.add.u32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw add ptr %addr, i32 %val seq_cst
- ret i32 %ret
-}
-
-; CHECK-LABEL: atom1
-define i64 @atom1(ptr %addr, i64 %val) {
-; CHECK-LABEL: atom1(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom1_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [atom1_param_1];
-; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
- %ret = atomicrmw add ptr %addr, i64 %val seq_cst
- ret i64 %ret
-}
-
-; CHECK-LABEL: atom2
-define i32 @atom2(ptr %subr, i32 %val) {
-; CHECK-LABEL: atom2(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom2_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atom2_param_1];
-; CHECK-NEXT: neg.s32 %r2, %r1;
-; CHECK-NEXT: atom.add.u32 %r3, [%rd1], %r2;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
-; CHECK-NEXT: ret;
- %ret = atomicrmw sub ptr %subr, i32 %val seq_cst
- ret i32 %ret
-}
-
-; CHECK-LABEL: atom3
-define i64 @atom3(ptr %subr, i64 %val) {
-; CHECK-LABEL: atom3(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom3_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [atom3_param_1];
-; CHECK-NEXT: neg.s64 %rd3, %rd2;
-; CHECK-NEXT: atom.add.u64 %rd4, [%rd1], %rd3;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
-; CHECK-NEXT: ret;
- %ret = atomicrmw sub ptr %subr, i64 %val seq_cst
- ret i64 %ret
-}
-
-; CHECK-LABEL: atom4
-define i32 @atom4(ptr %subr, i32 %val) {
-; CHECK-LABEL: atom4(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom4_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atom4_param_1];
-; CHECK-NEXT: atom.and.b32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw and ptr %subr, i32 %val seq_cst
- ret i32 %ret
-}
-
-; CHECK-LABEL: atom5
-define i64 @atom5(ptr %subr, i64 %val) {
-; CHECK-LABEL: atom5(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom5_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [atom5_param_1];
-; CHECK-NEXT: atom.and.b64 %rd3, [%rd1], %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
- %ret = atomicrmw and ptr %subr, i64 %val seq_cst
- ret i64 %ret
-}
-
-;; NAND not yet supported
-;define i32 @atom6(ptr %subr, i32 %val) {
-; %ret = atomicrmw nand ptr %subr, i32 %val seq_cst
-; ret i32 %ret
-;}
-
-;define i64 @atom7(ptr %subr, i64 %val) {
-; %ret = atomicrmw nand ptr %subr, i64 %val seq_cst
-; ret i64 %ret
-;}
-
-; CHECK-LABEL: atom8
-define i32 @atom8(ptr %subr, i32 %val) {
-; CHECK-LABEL: atom8(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom8_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atom8_param_1];
-; CHECK-NEXT: atom.or.b32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw or ptr %subr, i32 %val seq_cst
- ret i32 %ret
-}
-
-; CHECK-LABEL: atom9
-define i64 @atom9(ptr %subr, i64 %val) {
-; CHECK-LABEL: atom9(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom9_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [atom9_param_1];
-; CHECK-NEXT: atom.or.b64 %rd3, [%rd1], %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
- %ret = atomicrmw or ptr %subr, i64 %val seq_cst
- ret i64 %ret
-}
-
-; CHECK-LABEL: atom10
-define i32 @atom10(ptr %subr, i32 %val) {
-; CHECK-LABEL: atom10(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom10_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atom10_param_1];
-; CHECK-NEXT: atom.xor.b32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw xor ptr %subr, i32 %val seq_cst
- ret i32 %ret
-}
-
-; CHECK-LABEL: atom11
-define i64 @atom11(ptr %subr, i64 %val) {
-; CHECK-LABEL: atom11(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom11_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [atom11_param_1];
-; CHECK-NEXT: atom.xor.b64 %rd3, [%rd1], %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
- %ret = atomicrmw xor ptr %subr, i64 %val seq_cst
- ret i64 %ret
-}
-
-; CHECK-LABEL: atom12
-define i32 @atom12(ptr %subr, i32 %val) {
-; CHECK-LABEL: atom12(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom12_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atom12_param_1];
-; CHECK-NEXT: atom.max.s32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw max ptr %subr, i32 %val seq_cst
- ret i32 %ret
-}
-
-; CHECK-LABEL: atom13
-define i64 @atom13(ptr %subr, i64 %val) {
-; CHECK-LABEL: atom13(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom13_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [atom13_param_1];
-; CHECK-NEXT: atom.max.s64 %rd3, [%rd1], %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
- %ret = atomicrmw max ptr %subr, i64 %val seq_cst
- ret i64 %ret
-}
-
-; CHECK-LABEL: atom14
-define i32 @atom14(ptr %subr, i32 %val) {
-; CHECK-LABEL: atom14(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom14_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atom14_param_1];
-; CHECK-NEXT: atom.min.s32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw min ptr %subr, i32 %val seq_cst
- ret i32 %ret
-}
-
-; CHECK-LABEL: atom15
-define i64 @atom15(ptr %subr, i64 %val) {
-; CHECK-LABEL: atom15(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom15_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [atom15_param_1];
-; CHECK-NEXT: atom.min.s64 %rd3, [%rd1], %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
- %ret = atomicrmw min ptr %subr, i64 %val seq_cst
- ret i64 %ret
-}
-
-; CHECK-LABEL: atom16
-define i32 @atom16(ptr %subr, i32 %val) {
-; CHECK-LABEL: atom16(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom16_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atom16_param_1];
-; CHECK-NEXT: atom.max.u32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw umax ptr %subr, i32 %val seq_cst
- ret i32 %ret
-}
-
-; CHECK-LABEL: atom17
-define i64 @atom17(ptr %subr, i64 %val) {
-; CHECK-LABEL: atom17(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom17_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [atom17_param_1];
-; CHECK-NEXT: atom.max.u64 %rd3, [%rd1], %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
- %ret = atomicrmw umax ptr %subr, i64 %val seq_cst
- ret i64 %ret
-}
-
-; CHECK-LABEL: atom18
-define i32 @atom18(ptr %subr, i32 %val) {
-; CHECK-LABEL: atom18(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom18_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atom18_param_1];
-; CHECK-NEXT: atom.min.u32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw umin ptr %subr, i32 %val seq_cst
- ret i32 %ret
-}
-
-; CHECK-LABEL: atom19
-define i64 @atom19(ptr %subr, i64 %val) {
-; CHECK-LABEL: atom19(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<4>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom19_param_0];
-; CHECK-NEXT: ld.param.b64 %rd2, [atom19_param_1];
-; CHECK-NEXT: atom.min.u64 %rd3, [%rd1], %rd2;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
- %ret = atomicrmw umin ptr %subr, i64 %val seq_cst
- ret i64 %ret
-}
-
-define i32 @atom20(ptr %subr, i32 %val) {
-; CHECK-LABEL: atom20(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom20_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atom20_param_1];
-; CHECK-NEXT: atom.inc.u32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw uinc_wrap ptr %subr, i32 %val seq_cst
- ret i32 %ret
-}
-
-define i32 @atom21(ptr %subr, i32 %val) {
-; CHECK-LABEL: atom21(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atom21_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atom21_param_1];
-; CHECK-NEXT: atom.dec.u32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw udec_wrap ptr %subr, i32 %val seq_cst
- ret i32 %ret
-}
-
declare float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val)
; CHECK-LABEL: atomic_add_f32_generic
@@ -356,6 +13,7 @@ define float @atomic_add_f32_generic(ptr %addr, float %val) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [atomic_add_f32_generic_param_0];
+; CHECK-NEXT: membar.sys;
; CHECK-NEXT: ld.param.b32 %r1, [atomic_add_f32_generic_param_1];
; CHECK-NEXT: atom.add.f32 %r2, [%rd1], %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
@@ -375,6 +33,7 @@ define float @atomic_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [atomic_add_f32_addrspace1_param_0];
+; CHECK-NEXT: membar.sys;
; CHECK-NEXT: ld.param.b32 %r1, [atomic_add_f32_addrspace1_param_1];
; CHECK-NEXT: atom.global.add.f32 %r2, [%rd1], %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
@@ -394,6 +53,7 @@ define float @atomic_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [atomic_add_f32_addrspace3_param_0];
+; CHECK-NEXT: membar.sys;
; CHECK-NEXT: ld.param.b32 %r1, [atomic_add_f32_addrspace3_param_1];
; CHECK-NEXT: atom.shared.add.f32 %r2, [%rd1], %r1;
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
@@ -401,136 +61,3 @@ define float @atomic_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
%ret = call float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %val)
ret float %ret
}
-
-; CHECK-LABEL: atomicrmw_add_f32_generic
-define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
-; CHECK-LABEL: atomicrmw_add_f32_generic(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atomicrmw_add_f32_generic_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atomicrmw_add_f32_generic_param_1];
-; CHECK-NEXT: atom.add.f32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw fadd ptr %addr, float %val seq_cst
- ret float %ret
-}
-
-; CHECK-LABEL: atomicrmw_add_f16_generic
-define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
-; CHECK-LABEL: atomicrmw_add_f16_generic(
-; CHECK: {
-; CHECK-NEXT: .reg .pred %p<2>;
-; CHECK-NEXT: .reg .b16 %rs<4>;
-; CHECK-NEXT: .reg .b32 %r<18>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b16 %rs1, [atomicrmw_add_f16_generic_param_1];
-; CHECK-NEXT: ld.param.b64 %rd2, [atomicrmw_add_f16_generic_param_0];
-; CHECK-NEXT: and.b64 %rd1, %rd2, -4;
-; CHECK-NEXT: cvt.u32.u64 %r4, %rd2;
-; CHECK-NEXT: and.b32 %r5, %r4, 3;
-; CHECK-NEXT: shl.b32 %r1, %r5, 3;
-; CHECK-NEXT: mov.b32 %r6, 65535;
-; CHECK-NEXT: shl.b32 %r7, %r6, %r1;
-; CHECK-NEXT: not.b32 %r2, %r7;
-; CHECK-NEXT: ld.b32 %r17, [%rd1];
-; CHECK-NEXT: cvt.f32.f16 %r10, %rs1;
-; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start
-; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: shr.u32 %r8, %r17, %r1;
-; CHECK-NEXT: cvt.u16.u32 %rs2, %r8;
-; CHECK-NEXT: cvt.f32.f16 %r9, %rs2;
-; CHECK-NEXT: add.rn.f32 %r11, %r9, %r10;
-; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r11;
-; CHECK-NEXT: cvt.u32.u16 %r12, %rs3;
-; CHECK-NEXT: shl.b32 %r13, %r12, %r1;
-; CHECK-NEXT: and.b32 %r14, %r17, %r2;
-; CHECK-NEXT: or.b32 %r15, %r14, %r13;
-; CHECK-NEXT: membar.sys;
-; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r17, %r15;
-; CHECK-NEXT: setp.ne.b32 %p1, %r3, %r17;
-; CHECK-NEXT: mov.b32 %r17, %r3;
-; CHECK-NEXT: @%p1 bra $L__BB24_1;
-; CHECK-NEXT: // %bb.2: // %atomicrmw.end
-; CHECK-NEXT: shr.u32 %r16, %r3, %r1;
-; CHECK-NEXT: st.param.b16 [func_retval0], %r16;
-; CHECK-NEXT: ret;
- %ret = atomicrmw fadd ptr %addr, half %val seq_cst
- ret half %ret
-}
-
-; CHECK-LABEL: atomicrmw_add_f32_addrspace1
-define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
-; CHECK-LABEL: atomicrmw_add_f32_addrspace1(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atomicrmw_add_f32_addrspace1_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atomicrmw_add_f32_addrspace1_param_1];
-; CHECK-NEXT: atom.global.add.f32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw fadd ptr addrspace(1) %addr, float %val seq_cst
- ret float %ret
-}
-
-; CHECK-LABEL: atomicrmw_add_f32_addrspace3
-define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
-; CHECK-LABEL: atomicrmw_add_f32_addrspace3(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<3>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atomicrmw_add_f32_addrspace3_param_0];
-; CHECK-NEXT: ld.param.b32 %r1, [atomicrmw_add_f32_addrspace3_param_1];
-; CHECK-NEXT: atom.shared.add.f32 %r2, [%rd1], %r1;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %ret = atomicrmw fadd ptr addrspace(3) %addr, float %val seq_cst
- ret float %ret
-}
-
-; CHECK-LABEL: atomic_cmpxchg_i32
-define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) {
-; CHECK-LABEL: atomic_cmpxchg_i32(
-; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<4>;
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atomic_cmpxchg_i32_param_0];
-; CHECK-NEXT: membar.sys;
-; CHECK-NEXT: ld.param.b32 %r1, [atomic_cmpxchg_i32_param_1];
-; CHECK-NEXT: ld.param.b32 %r2, [atomic_cmpxchg_i32_param_2];
-; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
- ret i32 %new
-}
-
-; CHECK-LABEL: atomic_cmpxchg_i64
-define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) {
-; CHECK-LABEL: atomic_cmpxchg_i64(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b64 %rd1, [atomic_cmpxchg_i64_param_0];
-; CHECK-NEXT: membar.sys;
-; CHECK-NEXT: ld.param.b64 %rd2, [atomic_cmpxchg_i64_param_1];
-; CHECK-NEXT: ld.param.b64 %rd3, [atomic_cmpxchg_i64_param_2];
-; CHECK-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
-; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
-; CHECK-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
- ret i64 %new
-}
diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
index 01cd70d1530b0..6641035a416c2 100644
--- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
+++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll
@@ -64,12 +64,16 @@ define void @test_distributed_shared_cluster_float_atomic(ptr addrspace(7) %dsme
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0];
+; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: mov.b16 %rs1, 0x3C00;
-; CHECK-NEXT: atom.shared::cluster.add.noftz.f16 %rs2, [%rd1], %rs1;
+; CHECK-NEXT: atom.acquire.sys.shared::cluster.add.noftz.f16 %rs2, [%rd1], %rs1;
+; CHECK-NEXT: fence.sc.sys;
; CHECK-NEXT: mov.b16 %rs3, 0x3F80;
-; CHECK-NEXT: atom.shared::cluster.add.noftz.bf16 %rs4, [%rd1], %rs3;
-; CHECK-NEXT: atom.shared::cluster.add.f32 %r1, [%rd1], 0f3F800000;
-; CHECK-NEXT: atom.shared::cluster.add.f64 %rd2, [%rd1], 0d3FF0000000000000;
+; CHECK-NEXT: atom.acquire.sys.shared::cluster.add.noftz.bf16 %rs4, [%rd1], %rs3;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.sys.shared::cluster.add.f32 %r1, [%rd1], 0f3F800000;
+; CHECK-NEXT: fence.sc.sys;
+; CHECK-NEXT: atom.acquire.sys.shared::cluster.add.f64 %rd2, [%rd1], 0d3FF0000000000000;
; CHECK-NEXT: ret;
entry:
; Floating point atomic operations
@@ -90,20 +94,20 @@ define void @test_distributed_shared_cluster_int_atomic(ptr addrspace(7) %dsmem_
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b64 %rd1, [test_distributed_shared_cluster_int_atomic_param_0];
-; CHECK-NEXT: atom.shared::cluster.add.u32 %r1, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.add.u64 %rd2, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.exch.b32 %r2, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.exch.b64 %rd3, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.min.s32 %r3, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.min.s64 %rd4, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.min.u32 %r4, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.min.u64 %rd5, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.max.s32 %r5, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.max.s64 %rd6, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.max.u32 %r6, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.max.u64 %rd7, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.inc.u32 %r7, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.dec.u32 %r8, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.add.u32 %r1, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.add.u64 %rd2, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.exch.b32 %r2, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.exch.b64 %rd3, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.min.s32 %r3, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.min.s64 %rd4, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.min.u32 %r4, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.min.u64 %rd5, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.max.s32 %r5, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.max.s64 %rd6, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.max.u32 %r6, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.max.u64 %rd7, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.inc.u32 %r7, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.dec.u32 %r8, [%rd1], 1;
; CHECK-NEXT: ret;
entry:
; Integer add operations
@@ -142,12 +146,12 @@ define void @test_distributed_shared_cluster_bitwise_atomic(ptr addrspace(7) %ds
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: ld.param.b64 %rd1, [test_distributed_shared_cluster_bitwise_atomic_param_0];
-; CHECK-NEXT: atom.shared::cluster.and.b32 %r1, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.and.b64 %rd2, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.or.b32 %r2, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.or.b64 %rd3, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.xor.b32 %r3, [%rd1], 1;
-; CHECK-NEXT: atom.shared::cluster.xor.b64 %rd4, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.and.b32 %r1, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.and.b64 %rd2, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.or.b32 %r2, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.or.b64 %rd3, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.xor.b32 %r3, [%rd1], 1;
+; CHECK-NEXT: atom.relaxed.sys.shared::cluster.xor.b64 %rd4, [%rd1], 1;
; CHECK-NEXT: ret;
entry:
; Bitwise operations
diff --git a/llvm/test/CodeGen/NVPTX/lit.local.cfg b/llvm/test/CodeGen/NVPTX/lit.local.cfg
index 84cce669ec10b..dede5b354bb85 100644
--- a/llvm/test/CodeGen/NVPTX/lit.local.cfg
+++ b/llvm/test/CodeGen/NVPTX/lit.local.cfg
@@ -1,4 +1,4 @@
if not "NVPTX" in config.root.targets:
config.unsupported = True
config.suffixes.add(".py")
-config.excludes = ["fence.py", "cmpxchg.py"]
+config.excludes = ["fence.py", "cmpxchg.py", "atomicrmw.py"]
>From f34f61faa6b9b67fd45fdd0db5a2699bb9c77d05 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Tue, 3 Feb 2026 02:35:52 +0000
Subject: [PATCH 2/2] Check arch and sm version availability for atomicrmw
tests
---
llvm/test/CodeGen/NVPTX/atomicrmw-sm60.ll | 2 +-
llvm/test/CodeGen/NVPTX/atomicrmw-sm70.ll | 2 +-
llvm/test/CodeGen/NVPTX/atomicrmw-sm90.ll | 2 +-
llvm/test/CodeGen/NVPTX/atomicrmw.py | 4 ++--
4 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-sm60.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-sm60.ll
index 49d8701f25e46..7509cb53e424c 100644
--- a/llvm/test/CodeGen/NVPTX/atomicrmw-sm60.ll
+++ b/llvm/test/CodeGen/NVPTX/atomicrmw-sm60.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | FileCheck %s --check-prefix=SM60
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %}
+; RUN: %if ptxas-sm_60 && ptxas-isa-5.0 %{ llc < %s -march=nvptx64 -mcpu=sm_60 -mattr=+ptx50 | %ptxas-verify -arch=sm_60 %}
define i8 @xchg_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM60-LABEL: xchg_acq_rel_i8_global_cta(
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-sm70.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-sm70.ll
index 0bcc69f34d432..ec058567e9ec7 100644
--- a/llvm/test/CodeGen/NVPTX/atomicrmw-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/atomicrmw-sm70.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | FileCheck %s --check-prefix=SM70
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
+; RUN: %if ptxas-sm_70 && ptxas-isa-6.3 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx63 | %ptxas-verify -arch=sm_70 %}
define i8 @xchg_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM70-LABEL: xchg_acq_rel_i8_global_cta(
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw-sm90.ll b/llvm/test/CodeGen/NVPTX/atomicrmw-sm90.ll
index 82680b3b1aeec..bc918023b2658 100644
--- a/llvm/test/CodeGen/NVPTX/atomicrmw-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomicrmw-sm90.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.7 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
define i8 @xchg_acq_rel_i8_global_cta(ptr addrspace(1) %addr, i8 %val) {
; SM90-LABEL: xchg_acq_rel_i8_global_cta(
diff --git a/llvm/test/CodeGen/NVPTX/atomicrmw.py b/llvm/test/CodeGen/NVPTX/atomicrmw.py
index 6417891c56c57..f9c9362134af0 100644
--- a/llvm/test/CodeGen/NVPTX/atomicrmw.py
+++ b/llvm/test/CodeGen/NVPTX/atomicrmw.py
@@ -45,7 +45,7 @@
run_statement = Template(
"""; RUN: llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | FileCheck %s --check-prefix=SM${sm}
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify -arch=sm_${sm} %}
+; RUN: %if ptxas-sm_${sm} && ptxas-isa-${ptxfp} %{ llc < %s -march=nvptx64 -mcpu=sm_${sm} -mattr=+ptx${ptx} | %ptxas-verify -arch=sm_${sm} %}
"""
)
@@ -61,7 +61,7 @@ def get_addrspace_cast(addrspace):
for sm, ptx in TESTS:
# Slice 1: Keep addrspace, llvm_scope, ordering fixed, generate all possible operations and sizes
with open("atomicrmw-sm{}.ll".format(str(sm)), "w") as fp:
- print(run_statement.substitute(sm=sm, ptx=ptx), file=fp)
+ print(run_statement.substitute(sm=sm, ptx=ptx, ptxfp=ptx / 10.0), file=fp)
# Integer operations
addrspace, llvm_scope, ordering = 1, "block", "acq_rel"
for operation, datatype in product(
More information about the llvm-commits
mailing list