[llvm] [NVPTX] Load/Store/Fence syncscope support (PR #106101)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 19 07:33:44 PDT 2024
https://github.com/gonzalobg updated https://github.com/llvm/llvm-project/pull/106101
From ca1bf2ffb21dc8d5b6174c1e4bdc86d84f480ff5 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Sat, 17 Aug 2024 10:03:35 -0700
Subject: [PATCH 01/15] [NVPTX] Load/Store syncscope support
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 62 +-
llvm/lib/Target/NVPTX/NVPTX.h | 14 +-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 178 +-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 21 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 248 +-
.../Target/NVPTX/NVPTXReplaceImageHandles.cpp | 7 +-
llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 12 +
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 3 +
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 33 +-
.../floating-point-immediate-operands.mir | 8 +-
llvm/test/CodeGen/NVPTX/fence-sm-90.ll | 30 +
llvm/test/CodeGen/NVPTX/fence.ll | 76 +-
llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 3376 +++++++++++++++--
llvm/test/CodeGen/NVPTX/load-store-sm-90.ll | 1423 +++++++
llvm/test/CodeGen/NVPTX/load-store.ll | 507 ++-
15 files changed, 5263 insertions(+), 735 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/fence-sm-90.ll
create mode 100644 llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
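For reviewers, a minimal IR sketch (illustrative only, not part of the patch) of
the inputs this change lowers; the expected PTX qualifiers follow the mappings
introduced below ("" -> .sys, "block" -> .cta, "cluster" -> .cluster,
"device" -> .gpu):

  define void @scoped_ops(ptr %p) {
    ; expected: ld.relaxed.gpu.u32
    %v = load atomic i32, ptr %p syncscope("device") monotonic, align 4
    %w = add i32 %v, 1
    ; expected: st.release.cta.u32
    store atomic i32 %w, ptr %p syncscope("block") release, align 4
    ; expected: fence.sc.cluster (requires sm_90 and ptx78)
    fence syncscope("cluster") seq_cst
    ret void
  }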
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 5b568b0487b45a..2a44ce0273ee1b 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -233,46 +233,68 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
auto Ordering = NVPTX::Ordering(Imm);
switch (Ordering) {
case NVPTX::Ordering::NotAtomic:
- break;
- case NVPTX::Ordering::Volatile:
- O << ".volatile";
- break;
+ return;
case NVPTX::Ordering::Relaxed:
- O << ".relaxed.sys";
- break;
+ O << ".relaxed";
+ return;
case NVPTX::Ordering::Acquire:
- O << ".acquire.sys";
- break;
+ O << ".acquire";
+ return;
case NVPTX::Ordering::Release:
- O << ".release.sys";
- break;
+ O << ".release";
+ return;
+ case NVPTX::Ordering::Volatile:
+ O << ".volatile";
+ return;
case NVPTX::Ordering::RelaxedMMIO:
- O << ".mmio.relaxed.sys";
- break;
+ O << ".mmio.relaxed";
+ return;
default:
report_fatal_error(formatv(
- "NVPTX LdStCode Printer does not support \"{}\" sem modifier.",
- OrderingToCString(Ordering)));
+ "NVPTX LdStCode Printer does not support \"{}\" sem modifier. "
+ "Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.",
+ OrderingToString(Ordering)));
+ }
+ } else if (!strcmp(Modifier, "sco")) {
+ auto S = NVPTX::Scope(Imm);
+ switch (S) {
+ case NVPTX::Scope::Thread:
+ return;
+ case NVPTX::Scope::System:
+ O << ".sys";
+ return;
+ case NVPTX::Scope::Block:
+ O << ".cta";
+ return;
+ case NVPTX::Scope::Cluster:
+ O << ".cluster";
+ return;
+ case NVPTX::Scope::Device:
+ O << ".gpu";
+ return;
}
+ report_fatal_error(formatv(
+ "NVPTX LdStCode Printer does not support \"{}\" sco modifier.",
+ ScopeToString(S)));
} else if (!strcmp(Modifier, "addsp")) {
switch (Imm) {
case NVPTX::PTXLdStInstCode::GLOBAL:
O << ".global";
- break;
+ return;
case NVPTX::PTXLdStInstCode::SHARED:
O << ".shared";
- break;
+ return;
case NVPTX::PTXLdStInstCode::LOCAL:
O << ".local";
- break;
+ return;
case NVPTX::PTXLdStInstCode::PARAM:
O << ".param";
- break;
+ return;
case NVPTX::PTXLdStInstCode::CONSTANT:
O << ".const";
- break;
+ return;
case NVPTX::PTXLdStInstCode::GENERIC:
- break;
+ return;
default:
llvm_unreachable("Wrong Address Space");
}
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index f6f6acb9e13c90..b5624f9212ea27 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -117,12 +117,22 @@ enum Ordering : OrderingUnderlyingType {
// Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire"
Acquire = (OrderingUnderlyingType)AtomicOrdering::Acquire,
Release = (OrderingUnderlyingType)AtomicOrdering::Release,
- // AcquireRelease = 6, // TODO
+ AcquireRelease = (OrderingUnderlyingType)AtomicOrdering::AcquireRelease,
SequentiallyConsistent =
(OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent,
Volatile = SequentiallyConsistent + 1,
RelaxedMMIO = Volatile + 1,
- LAST = RelaxedMMIO
+ LASTORDERING = RelaxedMMIO
+};
+
+using ScopeUnderlyingType = unsigned int;
+enum Scope : ScopeUnderlyingType {
+ Thread = 0,
+ System = 1,
+ Block = 2,
+ Cluster = 3,
+ Device = 4,
+ LASTSCOPE = Device
};
namespace PTXLdStInstCode {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 4f0bc1a2044642..f04796fcdd49fe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -59,6 +59,7 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
+ Scopes = NVPTXScopes(MF.getFunction().getContext());
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -106,6 +107,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryStore(N))
return;
break;
+ case ISD::ATOMIC_FENCE:
+ if (tryFence(N))
+ return;
+ break;
case ISD::EXTRACT_VECTOR_ELT:
if (tryEXTRACT_VECTOR_ELEMENT(N))
return;
@@ -915,6 +920,42 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
} // namespace
+NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
+ NVPTX::Ordering Ord) const {
+ switch (Ord) {
+ case NVPTX::Ordering::NotAtomic:
+ case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
+ // NVPTX uses Thread scope as the scope of non-atomic operations.
+ return NVPTX::Scope::Thread;
+ case NVPTX::Ordering::RelaxedMMIO:
+ // RelaxedMMIO operations are always system scope.
+ // If a RelaxedMMIO order was generated from an atomic volatile operation
+ // with a smaller thread scope, we bump it here to system scope.
+ return NVPTX::Scope::System;
+ case NVPTX::Ordering::Relaxed:
+ case NVPTX::Ordering::Acquire:
+ case NVPTX::Ordering::Release:
+ case NVPTX::Ordering::AcquireRelease:
+ case NVPTX::Ordering::SequentiallyConsistent:
+ auto S = Scopes[N->getSyncScopeID()];
+
+ // Atomic operations must have a scope greater than thread.
+ if (S == NVPTX::Scope::Thread)
+ report_fatal_error(
+ formatv("Atomics need scope > \"{}\".", ScopeToString(S)));
+
+ // If scope is cluster, clusters must be supported.
+ if (S == NVPTX::Scope::Cluster)
+ Subtarget->requireClusters("cluster scope");
+
+ // If operation is volatile, then its scope is system.
+ if (N->isVolatile())
+ S = NVPTX::Scope::System;
+
+ return S;
+ }
+}
+
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
unsigned CodeAddrSpace, MachineFunction *F) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
@@ -957,33 +998,86 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
});
}
-NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL,
- SDValue &Chain,
- MemSDNode *N) {
+static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
+ NVPTXSubtarget const *T) {
+ if (S == NVPTX::Scope::Cluster)
+ T->requireClusters(".cluster scope fence");
+
+ switch (O) {
+ case NVPTX::Ordering::Acquire:
+ case NVPTX::Ordering::Release:
+ case NVPTX::Ordering::AcquireRelease: {
+ switch (S) {
+ case NVPTX::Scope::System:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
+ : NVPTX::INT_MEMBAR_SYS;
+ case NVPTX::Scope::Block:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
+ : NVPTX::INT_MEMBAR_CTA;
+ case NVPTX::Scope::Cluster:
+ return NVPTX::atomic_thread_fence_acq_rel_cluster;
+ case NVPTX::Scope::Device:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
+ : NVPTX::INT_MEMBAR_GL;
+ case NVPTX::Scope::Thread:
+ report_fatal_error(
+ formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
+ ScopeToString(S)));
+ }
+ }
+ case NVPTX::Ordering::SequentiallyConsistent: {
+ switch (S) {
+ case NVPTX::Scope::System:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
+ : NVPTX::INT_MEMBAR_SYS;
+ case NVPTX::Scope::Block:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
+ : NVPTX::INT_MEMBAR_CTA;
+ case NVPTX::Scope::Cluster:
+ return NVPTX::atomic_thread_fence_seq_cst_cluster;
+ case NVPTX::Scope::Device:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
+ : NVPTX::INT_MEMBAR_GL;
+ case NVPTX::Scope::Thread:
+ report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
+ ScopeToString(S)));
+ }
+ }
+ case NVPTX::Ordering::NotAtomic:
+ case NVPTX::Ordering::Relaxed:
+ case NVPTX::Ordering::Volatile:
+ case NVPTX::Ordering::RelaxedMMIO:
+ report_fatal_error(
+ formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
+ OrderingToString(O), ScopeToString(S)));
+ }
+}
+
+std::pair<NVPTX::Ordering, NVPTX::Scope>
+NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
+ MemSDNode *N) {
// Some memory instructions - loads, stores, atomics - need an extra fence
// instruction. Get the memory order of the instruction, and that of its
// fence, if any.
auto [InstructionOrdering, FenceOrdering] =
getOperationOrderings(N, Subtarget);
+ auto Scope = getOperationScope(N, InstructionOrdering);
// If a fence is required before the operation, insert it:
switch (NVPTX::Ordering(FenceOrdering)) {
case NVPTX::Ordering::NotAtomic:
break;
case NVPTX::Ordering::SequentiallyConsistent: {
- unsigned Op = Subtarget->hasMemoryOrdering()
- ? NVPTX::atomic_thread_fence_seq_cst_sys
- : NVPTX::INT_MEMBAR_SYS;
+ auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
break;
}
default:
report_fatal_error(
formatv("Unexpected fence ordering: \"{}\".",
- OrderingToCString(NVPTX::Ordering(FenceOrdering))));
+ OrderingToString(NVPTX::Ordering(FenceOrdering))));
}
-
- return InstructionOrdering;
+ return std::make_pair(InstructionOrdering, Scope);
}
bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
@@ -1154,7 +1248,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
- auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, LD);
+ auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
// Type Setting: fromType + fromTypeWidth
//
@@ -1189,7 +1283,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
std::optional<unsigned> Opcode;
MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
- SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL),
+ SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL)});
@@ -1266,7 +1360,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
- auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
+ auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1319,7 +1413,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
std::optional<unsigned> Opcode;
SDNode *LD;
- SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL),
+ SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL)});
@@ -1895,7 +1989,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
SDLoc DL(N);
SDValue Chain = ST->getChain();
- auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, ST);
+ auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1923,10 +2017,10 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
MVT::SimpleValueType SourceVT =
Value.getNode()->getSimpleValueType(0).SimpleTy;
- SmallVector<SDValue, 12> Ops({Value, getI32Imm(InstructionOrdering, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(ToType, DL),
- getI32Imm(ToTypeWidth, DL)});
+ SmallVector<SDValue, 12> Ops(
+ {Value, getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
+ getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
+ getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
if (SelectDirectAddr(BasePtr, Addr)) {
Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
@@ -2005,7 +2099,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
- auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
+ auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
@@ -2044,9 +2138,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
ToTypeWidth = 32;
}
- Ops.append({getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(ToType, DL),
- getI32Imm(ToTypeWidth, DL)});
+ Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
+ getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
+ getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
if (SelectDirectAddr(N2, Addr)) {
switch (N->getOpcode()) {
@@ -4064,3 +4158,43 @@ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
}
}
}
+
+bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
+ SDLoc DL(N);
+ assert(N->getOpcode() == ISD::ATOMIC_FENCE);
+ unsigned int FenceOp =
+ getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
+ Scopes[N->getConstantOperandVal(2)], Subtarget);
+ SDValue Chain = N->getOperand(0);
+ SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
+ ReplaceNode(N, FenceNode);
+ return true;
+}
+
+NVPTXScopes::NVPTXScopes(LLVMContext &C) : CTX(&C) {
+ Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
+ Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
+ Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
+ Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
+ Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
+}
+
+NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
+ if (Scopes.empty())
+ report_fatal_error("NVPTX Scopes must be initialized before calling "
+ "NVPTXScopes::operator[]");
+
+ auto S = Scopes.find(ID);
+ if (S == Scopes.end()) {
+ SmallVector<StringRef, 8> ScopeNames;
+ assert(CTX != nullptr && "CTX is nullptr");
+ CTX->getSyncScopeNames(ScopeNames);
+ StringRef Unknown{"unknown"};
+ auto Name = ID < ScopeNames.size() ? ScopeNames[ID] : Unknown;
+ report_fatal_error(
+ formatv("Could not find scope ID={} with name \"{}\".", int(ID), Name));
+ }
+ return S->second;
+}
+
+bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
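To summarize the scope rules above (a sketch of the expected behavior, derived
from getOperationScope and NVPTXScopes): named syncscopes map to PTX scope
qualifiers, volatile atomics are widened to system scope, and atomics at
"singlethread" scope are rejected with a fatal error:

  define void @scope_rules(ptr %p) {
    ; expected: ld.relaxed.gpu.u32 / st.relaxed.gpu.u32 ("device" -> .gpu)
    %a = load atomic i32, ptr %p syncscope("device") monotonic, align 4
    %b = add i32 %a, 1
    store atomic i32 %b, ptr %p syncscope("device") monotonic, align 4
    ; expected: ld.acquire.sys.u32 (volatile atomics are bumped to system scope)
    %c = load atomic volatile i32, ptr %p syncscope("device") acquire, align 4
    %d = add i32 %c, 1
    ; expected: st.release.sys.u32 (volatile atomics are bumped to system scope)
    store atomic volatile i32 %d, ptr %p syncscope("device") release, align 4
    ; a syncscope("singlethread") atomic would hit the
    ; "Atomics need scope > Thread" fatal error
    ret void
  }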
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index eac4056599511c..7eccf9e45314b1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -18,13 +18,26 @@
#include "NVPTXISelLowering.h"
#include "NVPTXRegisterInfo.h"
#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Compiler.h"
namespace llvm {
+struct NVPTXScopes {
+ NVPTXScopes() = default;
+ NVPTXScopes(LLVMContext &C);
+ NVPTX::Scope operator[](SyncScope::ID ID) const;
+ bool empty() const;
+
+private:
+ SmallMapVector<SyncScope::ID, NVPTX::Scope, 8> Scopes{};
+ LLVMContext *CTX = nullptr;
+};
+
class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
const NVPTXTargetMachine &TM;
@@ -38,6 +51,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool allowUnsafeFPMath() const;
bool doRsqrtOpt() const;
+ NVPTXScopes Scopes{};
+
public:
NVPTXDAGToDAGISel() = delete;
@@ -66,6 +81,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryLoadParam(SDNode *N);
bool tryStoreRetval(SDNode *N);
bool tryStoreParam(SDNode *N);
+ bool tryFence(SDNode *N);
void SelectAddrSpaceCast(SDNode *N);
bool tryTextureIntrinsic(SDNode *N);
bool trySurfaceIntrinsic(SDNode *N);
@@ -100,8 +116,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, LoadSDNode *N);
- NVPTX::Ordering insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
- MemSDNode *N);
+ std::pair<NVPTX::Ordering, NVPTX::Scope>
+ insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, MemSDNode *N);
+ NVPTX::Scope getOperationScope(MemSDNode *N, NVPTX::Ordering O) const;
};
class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b7e210805db904..963c1de07bab74 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2971,39 +2971,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
multiclass LD<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _ari : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _ari_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _asi : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
}
@@ -3019,39 +3019,42 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _ari : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr,
+ i32imm:$offset),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr,
+ i32imm:$offset),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _asi : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr,
+ i32imm:$offset),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
}
@@ -3070,75 +3073,75 @@ let mayStore=1, hasSideEffects=0 in {
multiclass LD_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v4_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
@@ -3153,84 +3156,87 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ imem:$addr),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int32Regs:$addr),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int64Regs:$addr),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_ari : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
- i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int32Regs:$addr, i32imm:$offset),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
- i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int64Regs:$addr, i32imm:$offset),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_asi : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
- i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ imem:$addr, i32imm:$offset),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v4_avar : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_asi : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}"
"$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
@@ -4003,17 +4009,23 @@ def atomic_thread_fence_acq_rel_sys :
NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
Requires<[hasPTX<60>, hasSM<70>]>;
-def : Pat<(atomic_fence (i64 4), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acquire(4) sys(1)
- Requires<[hasPTX<60>, hasSM<70>]>;
-def : Pat<(atomic_fence (i64 5), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // release(5) sys(1)
- Requires<[hasPTX<60>, hasSM<70>]>;
-def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acq_rel(6) sys(1)
- Requires<[hasPTX<60>, hasSM<70>]>;
-def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys)>, // seq_cst(7) sys(1)
- Requires<[hasPTX<60>, hasSM<70>]>;
-
-// If PTX<60 or SM<70, we fall back to MEMBAR:
-def : Pat<(atomic_fence (i64 4), (i64 1)), (INT_MEMBAR_SYS)>; // acquire(4) sys(1)
-def : Pat<(atomic_fence (i64 5), (i64 1)), (INT_MEMBAR_SYS)>; // release(5) sys(1)
-def : Pat<(atomic_fence (i64 6), (i64 1)), (INT_MEMBAR_SYS)>; // acq_rel(6) sys(1)
-def : Pat<(atomic_fence (i64 7), (i64 1)), (INT_MEMBAR_SYS)>; // seq_cst(7) sys(1)
+def atomic_thread_fence_seq_cst_gpu :
+ NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+def atomic_thread_fence_acq_rel_gpu :
+ NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+
+def atomic_thread_fence_seq_cst_cluster :
+ NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+def atomic_thread_fence_acq_rel_cluster :
+ NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+
+def atomic_thread_fence_seq_cst_cta :
+ NVPTXInst<(outs), (ins), "fence.sc.cta;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+def atomic_thread_fence_acq_rel_cta :
+ NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
\ No newline at end of file
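Selection of these fence instructions (see getFenceOp above) can be exercised
with a small IR sketch; the expected output below follows the fence.ll tests in
this patch:

  define void @fences() {
    fence syncscope("block") acq_rel    ; sm_70+ptx60: fence.acq_rel.cta, else membar.cta
    fence syncscope("device") seq_cst   ; sm_70+ptx60: fence.sc.gpu,      else membar.gl
    fence syncscope("cluster") acquire  ; sm_90+ptx78: fence.acq_rel.cluster
    fence seq_cst                       ; sm_70+ptx60: fence.sc.sys,      else membar.sys
    ret void
  }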
diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index f2515f971595bf..f66504b09cb63f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -11,12 +11,11 @@
// to work reliably, inlining of all function call must be performed.
//
//===----------------------------------------------------------------------===//
-
+#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
-#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -1820,8 +1819,8 @@ findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
return false;
}
- assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!");
- StringRef Sym = TexHandleDef.getOperand(6).getSymbolName();
+ assert(TexHandleDef.getOperand(7).isSymbol() && "Load is not a symbol!");
+ StringRef Sym = TexHandleDef.getOperand(7).getSymbolName();
std::string ParamBaseName = std::string(MF.getName());
ParamBaseName += "_param_";
assert(Sym.starts_with(ParamBaseName) && "Invalid symbol reference");
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 420065585b3849..1fb8e3c6565593 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -12,6 +12,8 @@
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
using namespace llvm;
@@ -69,3 +71,13 @@ bool NVPTXSubtarget::hasImageHandles() const {
bool NVPTXSubtarget::allowFP16Math() const {
return hasFP16Math() && NoF16Math == false;
}
+
+void NVPTXSubtarget::requireClusters(std::string const &FailureMessage) const {
+ if (hasClusters())
+ return;
+
+ report_fatal_error(formatv(
+ "NVPTX SM architecture \"{}\" and PTX version \"{}\" do not support {}. "
+ "Requires SM >= 90 and PTX >= 78.",
+ getFullSmVersion(), PTXVersion, FailureMessage));
+}
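requireClusters turns an unsupported cluster-scope request into a hard error
instead of silently emitting PTX the target cannot run. A sketch of the
expected behavior (invocations mirror the RUN lines of the tests in this
patch):

  ; llc -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78  -> fence.sc.cluster
  ; llc -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60  -> report_fatal_error
  ;   ("... do not support .cluster scope fence. Requires SM >= 90 and PTX >= 78.")
  define void @needs_clusters() {
    fence syncscope("cluster") seq_cst
    ret void
  }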
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 0591782e8148b9..30be7a9906850c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -78,6 +78,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasAtomBitwise64() const { return SmVersion >= 32; }
bool hasAtomMinMax64() const { return SmVersion >= 32; }
bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
+ bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
bool hasLDG() const { return SmVersion >= 32; }
bool hasHWROT32() const { return SmVersion >= 32; }
bool hasImageHandles() const;
@@ -118,6 +119,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+
+ void requireClusters(std::string const &FailureMessage) const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index eebd91fefe4f03..77b35e0777c95a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -20,6 +20,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"
+#include "llvm/Support/FormatVariadic.h"
#include <cstdarg>
#include <set>
#include <string>
@@ -86,7 +87,7 @@ bool Isv2x16VT(EVT VT);
namespace NVPTX {
-inline std::string OrderingToCString(Ordering Order) {
+inline std::string OrderingToString(Ordering Order) {
switch (Order) {
case Ordering::NotAtomic:
return "NotAtomic";
@@ -96,7 +97,8 @@ inline std::string OrderingToCString(Ordering Order) {
return "Acquire";
case Ordering::Release:
return "Release";
- // case Ordering::AcquireRelease: return "AcquireRelease";
+ case Ordering::AcquireRelease:
+ return "AcquireRelease";
case Ordering::SequentiallyConsistent:
return "SequentiallyConsistent";
case Ordering::Volatile:
@@ -104,11 +106,34 @@ inline std::string OrderingToCString(Ordering Order) {
case Ordering::RelaxedMMIO:
return "RelaxedMMIO";
}
- report_fatal_error("unknown ordering");
+ report_fatal_error(formatv("Unknown NVPTX::Ordering \"{}\".",
+ static_cast<OrderingUnderlyingType>(Order)));
}
inline raw_ostream &operator<<(raw_ostream &O, Ordering Order) {
- O << OrderingToCString(Order);
+ O << OrderingToString(Order);
+ return O;
+}
+
+inline std::string ScopeToString(Scope S) {
+ switch (S) {
+ case Scope::Thread:
+ return "Thread";
+ case Scope::System:
+ return "System";
+ case Scope::Block:
+ return "Block";
+ case Scope::Cluster:
+ return "Cluster";
+ case Scope::Device:
+ return "Device";
+ }
+ report_fatal_error(formatv("Unknown NVPTX::Scope \"{}\".",
+ static_cast<ScopeUnderlyingType>(S)));
+}
+
+inline raw_ostream &operator<<(raw_ostream &O, Scope S) {
+ O << ScopeToString(S);
return O;
}
diff --git a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
index 58e2e644b000fe..a40b4d85773b29 100644
--- a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
+++ b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
@@ -40,9 +40,9 @@ registers:
- { id: 7, class: float32regs }
body: |
bb.0.entry:
- %0 = LD_f32_avar 0, 4, 1, 2, 32, &test_param_0
+ %0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test_param_0
%1 = CVT_f64_f32 %0, 0
- %2 = LD_i32_avar 0, 4, 1, 0, 32, &test_param_1
+ %2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test_param_1
; CHECK: %3:float64regs = FADD_rnf64ri %1, double 3.250000e+00
%3 = FADD_rnf64ri %1, double 3.250000e+00
%4 = CVT_f32_f64 %3, 5
@@ -66,9 +66,9 @@ registers:
- { id: 7, class: float32regs }
body: |
bb.0.entry:
- %0 = LD_f32_avar 0, 4, 1, 2, 32, &test2_param_0
+ %0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test2_param_0
%1 = CVT_f64_f32 %0, 0
- %2 = LD_i32_avar 0, 4, 1, 0, 32, &test2_param_1
+ %2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test2_param_1
; CHECK: %3:float64regs = FADD_rnf64ri %1, double 0x7FF8000000000000
%3 = FADD_rnf64ri %1, double 0x7FF8000000000000
%4 = CVT_f32_f64 %3, 5
diff --git a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
new file mode 100644
index 00000000000000..82eb5fb71677b6
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+
+; CHECK-LABEL: fence_sc_cluster
+define void @fence_sc_cluster() local_unnamed_addr {
+ ; CHECK: fence.sc.cluster
+ fence syncscope("cluster") seq_cst
+ ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_cluster
+define void @fence_acq_rel_cluster() local_unnamed_addr {
+ ; CHECK: fence.acq_rel.cluster
+ fence syncscope("cluster") acq_rel
+ ret void
+}
+
+; CHECK-LABEL: fence_release_cluster
+define void @fence_release_cluster() local_unnamed_addr {
+ ; CHECK: fence.acq_rel.cluster
+ fence syncscope("cluster") release
+ ret void
+}
+
+; CHECK-LABEL: fence_acquire_cluster
+define void @fence_acquire_cluster() local_unnamed_addr {
+ ; CHECK: fence.acq_rel.cluster
+ fence syncscope("cluster") acquire
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll
index d3aace95e96650..626685f82f32ca 100644
--- a/llvm/test/CodeGen/NVPTX/fence.ll
+++ b/llvm/test/CodeGen/NVPTX/fence.ll
@@ -3,6 +3,8 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70
; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %}
+; TODO: implement and test thread scope.
+
; CHECK-LABEL: fence_sc_sys
define void @fence_sc_sys() local_unnamed_addr {
; SM60: membar.sys
@@ -16,21 +18,85 @@ define void @fence_acq_rel_sys() local_unnamed_addr {
; SM60: membar.sys
; SM70: fence.acq_rel.sys
fence acq_rel
- ret void
+ ret void
}
; CHECK-LABEL: fence_release_sys
define void @fence_release_sys() local_unnamed_addr {
; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
+ ; SM70: fence.acq_rel.sys
fence release
- ret void
+ ret void
}
; CHECK-LABEL: fence_acquire_sys
define void @fence_acquire_sys() local_unnamed_addr {
; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
+ ; SM70: fence.acq_rel.sys
fence acquire
- ret void
+ ret void
+}
+
+; CHECK-LABEL: fence_sc_gpu
+define void @fence_sc_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.sc.gpu
+ fence syncscope("device") seq_cst
+ ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_gpu
+define void @fence_acq_rel_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.acq_rel.gpu
+ fence syncscope("device") acq_rel
+ ret void
+}
+
+; CHECK-LABEL: fence_release_gpu
+define void @fence_release_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.acq_rel.gpu
+ fence syncscope("device") release
+ ret void
+}
+
+; CHECK-LABEL: fence_acquire_gpu
+define void @fence_acquire_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.acq_rel.gpu
+ fence syncscope("device") acquire
+ ret void
+}
+
+; CHECK-LABEL: fence_sc_cta
+define void @fence_sc_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.sc.cta
+ fence syncscope("block") seq_cst
+ ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_cta
+define void @fence_acq_rel_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.acq_rel.cta
+ fence syncscope("block") acq_rel
+ ret void
+}
+
+; CHECK-LABEL: fence_release_cta
+define void @fence_release_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.acq_rel.cta
+ fence syncscope("block") release
+ ret void
+}
+
+; CHECK-LABEL: fence_acquire_cta
+define void @fence_acquire_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.acq_rel.cta
+ fence syncscope("block") acquire
+ ret void
}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
index 9cea33d12027f2..4b200eacb0cf4a 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -1,10 +1,367 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s
; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
+; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;"
+; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile release": generates "st.release.sys;"
+; but should generate "fence.acq_rel.sys; st.mmio.relaxed.sys;"
+
+; TODO: fix "atomic load volatile seq_cst": generates "fence.sc.sys; ld.acquire.sys;"
+; but should generate "fence.sc.sys; ld.mmio.relaxed.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile seq_cst": generates "fence.sc.sys; st.release.sys;"
+; but should generate "fence.sc.sys; st.mmio.relaxed.sys;"
+
+; TODO: add i1, <8 x i8>, and <6 x i8> vector tests.
+
+; TODO: add tests for vectors that exceed 128 bits in length.
+; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors,
+; vectors cannot exceed 128 bits in length, i.e., .v4.u64 is not allowed.
+
+; TODO: generate PTX that preserves Concurrent Forward Progress
+; for atomic operations to the local statespace
+; by generating atomic or volatile operations.
+
+; TODO: design exposure for atomic operations on vector types.
+
+; TODO: implement and test thread scope.
+
+; TODO: add weak, atomic, volatile, and atomic volatile tests
+; for .const and .param statespaces.
+
+; TODO: optimize .sys.shared into .cta.shared or .cluster.shared.
+
;; generic statespace
-; CHECK-LABEL: generic_acq_rel
-define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_unordered_gpu
+define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_unordered_volatile_gpu
+define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_unordered_cta
+define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_unordered_volatile_cta
+define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
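+; syncscope("device") corresponds to the PTX GPU scope: monotonic (relaxed)
+; atomics at this scope are expected to lower to ld/st.relaxed.gpu.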
+; CHECK-LABEL: generic_monotonic_gpu
+define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_volatile_gpu
+define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_cta
+define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_volatile_cta
+define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_sys
+define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -31,7 +388,7 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr %e release, align 4
@@ -44,8 +401,8 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam
ret void
}
-; CHECK-LABEL: generic_acq_rel_volatile
-define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_acq_rel_volatile_sys
+define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -72,7 +429,7 @@ define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) lo
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr %e release, align 4
@@ -85,8 +442,172 @@ define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) lo
ret void
}
-; CHECK-LABEL: generic_sc
-define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
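+; Acquire loads and release stores keep their requested scope, e.g. at
+; syncscope("device") they are expected to lower to ld.acquire.gpu and
+; st.release.gpu.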
+; CHECK-LABEL: generic_acq_rel_gpu
+define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") release, align 8
+
+ ret void
+}
+
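+; Volatile acquire/release atomics are expected to be emitted at system scope
+; (ld.acquire.sys / st.release.sys) regardless of the IR syncscope.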
+; CHECK-LABEL: generic_acq_rel_volatile_gpu
+define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_cta
+define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_volatile_cta
+define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") release, align 8
+
+ ret void
+}
+
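+; seq_cst is expected to lower to a fence.sc at the requested scope, emitted
+; before an acquire load or a release store at that same scope.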
+; CHECK-LABEL: generic_sc_sys
+define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr %a seq_cst, align 1
@@ -122,7 +643,7 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr %e seq_cst, align 4
@@ -138,8 +659,8 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad
ret void
}
-; CHECK-LABEL: generic_sc_volatile
-define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_sc_volatile_sys
+define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr %a seq_cst, align 1
@@ -175,7 +696,7 @@ define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr %e seq_cst, align 4
@@ -191,393 +712,2338 @@ define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
ret void
}
-;; global statespace
-
-; CHECK-LABEL: global_acq_rel
-define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1
+; CHECK-LABEL: generic_sc_gpu
+define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a release, align 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
- ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b release, align 2
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
- ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c release, align 4
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
- ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d release, align 8
-
- ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e release, align 4
-
- ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e release, align 8
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") seq_cst, align 8
ret void
}
-; CHECK-LABEL: global_acq_rel_volatile
-define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1
+; CHECK-LABEL: generic_sc_volatile_gpu
+define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
- ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
- ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
- ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
- ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") seq_cst, align 4
- ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_sc_cta
+define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") seq_cst, align 8
ret void
}
-; CHECK-LABEL: global_seq_cst
-define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: generic_sc_volatile_cta
+define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") seq_cst, align 1
%a.add = add i8 %a.load, 1
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") seq_cst, align 2
%b.add = add i16 %b.load, 1
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") seq_cst, align 4
%c.add = add i32 %c.load, 1
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") seq_cst, align 8
%d.add = add i64 %d.load, 1
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") seq_cst, align 4
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") seq_cst, align 8
%f.add = fadd double %f.load, 1.
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") seq_cst, align 8
ret void
}
-; CHECK-LABEL: global_seq_cst_volatile
-define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1
+;; global statespace
+
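+; The ordering/scope combinations above are repeated for addrspace(1), where
+; the .global qualifier is printed after the scope (e.g. ld.relaxed.gpu.global).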
+; CHECK-LABEL: global_unordered_gpu
+define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
+ ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2
+ ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
+ ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4
+ ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
+ ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8
+ ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
+ ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4
+ ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8
+ ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") unordered, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8
+ ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
ret void
}
-;; shared statespace
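+; Unlike the generic-statespace tests above, volatile relaxed atomics on the
+; global statespace are expected to select the ld/st.mmio.relaxed.sys forms.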
+; CHECK-LABEL: global_unordered_volatile_gpu
+define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
-; CHECK-LABEL: shared_acq_rel
-define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_unordered_cta
+define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a release, align 1
+ ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
- ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2
+ ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b release, align 2
+ ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
- ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4
+ ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c release, align 4
+ ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
- ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8
+ ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d release, align 8
+ ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
- ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e release, align 4
+ ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
- ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8
+ ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e release, align 8
+ ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
ret void
}
-; CHECK-LABEL: shared_acq_rel_volatile
-define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1
+; CHECK-LABEL: global_unordered_volatile_cta
+define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
- ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
- ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
- ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
- ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
- ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
ret void
}
-; CHECK-LABEL: shared_seq_cst
-define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1
+; CHECK-LABEL: global_monotonic_gpu
+define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
+ ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2
+ ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
+ ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4
+ ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
+ ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8
+ ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
+ ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4
+ ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8
+ ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
ret void
}
-; CHECK-LABEL: shared_seq_cst_volatile
-define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1
+; CHECK-LABEL: global_monotonic_volatile_gpu
+define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
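+; syncscope("block") maps to the PTX .cta scope qualifier.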
+; CHECK-LABEL: global_monotonic_cta
+define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic_volatile_cta
+define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
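+; acquire/release atomics lower directly to ld.acquire.<scope>/st.release.<scope>;
+; per the CHECK lines below, the volatile acq_rel variants default to .sys scope.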
+; CHECK-LABEL: global_acq_rel_sys
+define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_volatile_sys
+define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_gpu
+define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_volatile_gpu
+define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_cta
+define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_volatile_cta
+define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
+
+ ret void
+}
+
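+; seq_cst lowers to a fence.sc.<scope> followed by an acquire load or a
+; release store at the same scope; volatile seq_cst defaults to .sys.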
+; CHECK-LABEL: global_seq_cst_sys
+define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_volatile_sys
+define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_gpu
+define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_volatile_gpu
+define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_cta
+define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_volatile_cta
+define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
+;; shared statespace
+
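+; The ordering/scope combinations above are repeated for the shared
+; statespace. Volatile unordered/monotonic shared atomics are expected to
+; lower to plain ld.volatile.shared/st.volatile.shared, with no scope
+; qualifier and no .mmio.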
+; CHECK-LABEL: shared_unordered_gpu
+define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile_gpu
+define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_unordered_cta
+define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile_cta
+define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_gpu
+define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile_gpu
+define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_cta
+define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile_cta
+define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
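+; As for the global statespace, volatile acquire/release on shared memory
+; defaults to .sys scope in the CHECK lines below.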
+; CHECK-LABEL: shared_acq_rel_sys
+define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_volatile_sys
+define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_gpu
+define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
+
+ ret void
+}
+
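+; Note: in the volatile variants below the IR still carries a narrower
+; syncscope ("device" or "block"), but the CHECK lines expect the PTX at
+; the .sys scope: volatile atomic accesses are verified at system scope
+; regardless of the requested syncscope.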
+; CHECK-LABEL: shared_acq_rel_volatile_gpu
+define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_cta
+define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_volatile_cta
+define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
+
+ ret void
+}
+
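+; Note: the seq_cst tests below expect a two-step lowering: a fence.sc at
+; the requested scope, followed by an ld.acquire (for loads) or st.release
+; (for stores) at that same scope.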
+; CHECK-LABEL: shared_seq_cst_sys
+define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_volatile_sys
+define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_gpu
+define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
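+; Note: as with acq_rel above, the volatile seq_cst variants are verified
+; with both the fence.sc and the access itself at .sys scope, even though
+; the IR requests syncscope("device") or syncscope("block").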
+; CHECK-LABEL: shared_seq_cst_volatile_gpu
+define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_cta
+define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_volatile_cta
+define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
+;; local statespace
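+; Note: the local statespace is private to each thread, so no ordering or
+; scope is required; the tests below expect atomic and volatile atomic
+; accesses to lower to plain ld.local/st.local.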
+
+; CHECK-LABEL: local_unordered_gpu
+define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile_gpu
+define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_unordered_cta
+define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile_cta
+define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_gpu
+define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile_gpu
+define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_cta
+define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile_cta
+define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
 
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
 
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
 
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
 
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
 
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") monotonic, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
 
ret void
 }
 
-;; local statespace
-
-; CHECK-LABEL: local_acq_rel
-define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_acq_rel_sys
+define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -604,7 +3070,7 @@ define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
 
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 store atomic float %e.add, ptr addrspace(5) %e release, align 4
 
@@ -617,11 +3083,8 @@ define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
ret void
 }
 
-; CHECK-LABEL: local_acq_rel_volatile
-define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_acq_rel_volatile_sys
+define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -648,7 +3111,7 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
 
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
 
@@ -661,11 +3124,172 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
ret void
 }
 
-; CHECK-LABEL: local_seq_cst
-define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_acq_rel_gpu
+define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_volatile_gpu
+define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_cta
+define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_volatile_cta
+define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_sys
+define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
@@ -692,7 +3316,7 @@ define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
 
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4
 
@@ -705,11 +3329,8 @@ define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
ret void
 }
 
-; CHECK-LABEL: local_seq_cst_volatile
-define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_seq_cst_volatile_sys
+define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
@@ -736,7 +3357,7 @@ define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
 
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
 store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4
 
@@ -746,10 +3367,169 @@ define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
 store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8
 
- ; TODO: LLVM IR Verifier does not support atomics on vector types.
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_gpu
+define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_volatile_gpu
+define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_cta
+define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
ret void
}
-; TODO: add plain,atomic,volatile,atomic volatile tests
-; for .const and .param statespaces
\ No newline at end of file
+; CHECK-LABEL: local_seq_cst_volatile_cta
+define void @local_seq_cst_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
new file mode 100644
index 00000000000000..645170da51a011
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
@@ -0,0 +1,1423 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+
+; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;"
+; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile release": generates "st.release.sys;"
+; but should generate "fence.acq_rel.sys; st.mmio.relaxed.sys;"
+
+; TODO: fix "atomic load volatile seq_cst": generates "fence.sc.sys; ld.acquire.sys;"
+; but should generate "fence.sc.sys; ld.relaxed.mmio.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile seq_cst": generates "fence.sc.sys; st.release.sys;"
+; but should generate "fence.sc.sys; st.relaxed.mmio.sys;"
+
+; TODO: add i1, <8 x i8>, and <6 x i8> vector tests.
+
+; TODO: add tests for vectors that exceed 128 bits in length.
+; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors,
+; vectors cannot exceed 128 bits in length, i.e., .v4.u64 is not allowed.
+
+; TODO: generate PTX that preserves Concurrent Forward Progress
+; for atomic operations to the local statespace
+; by generating atomic or volatile operations.
+
+; TODO: design exposure for atomic operations on vector types.
+
+; TODO: implement and test thread scope.
+
+; TODO: add weak, atomic, volatile, and atomic volatile tests
+; for the .const and .param statespaces.
+
+; TODO: optimize .shared.sys into .shared.cta or .shared.cluster.
+
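+; Summary of the lowering exercised by the CHECK lines below:
+;   unordered/monotonic atomic -> ld.relaxed.<scope> / st.relaxed.<scope>
+;   acquire/release atomic     -> ld.acquire.<scope> / st.release.<scope>
+;   seq_cst atomic             -> fence.sc.<scope>; ld.acquire.<scope> / st.release.<scope>
+;   volatile atomic            -> strengthened to system scope; relaxed volatile
+;                                 atomics lower to ld.volatile / st.volatile
+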
+;; generic statespace
+
+; CHECK-LABEL: generic_unordered_cluster
+define void @generic_unordered_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
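+; Volatile implies system scope, so for volatile atomics the "cluster"
+; syncscope is dropped and relaxed volatile accesses lower to plain
+; ld.volatile / st.volatile.
+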
+; CHECK-LABEL: generic_unordered_volatile_cluster
+define void @generic_unordered_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_cluster
+define void @generic_monotonic_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_volatile_cluster
+define void @generic_monotonic_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_cluster
+define void @generic_acq_rel_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
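+; Volatile acquire/release at cluster scope is strengthened to system scope:
+; ld.acquire.sys / st.release.sys.
+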
+; CHECK-LABEL: generic_acq_rel_volatile_cluster
+define void @generic_acq_rel_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_sc_cluster
+define void @generic_sc_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
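+; Volatile seq_cst likewise uses system scope: fence.sc.sys before each
+; ld.acquire.sys / st.release.sys.
+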
+; CHECK-LABEL: generic_sc_volatile_cluster
+define void @generic_sc_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+;; global statespace
+
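+; The global statespace follows the same scheme as the generic one, with the
+; .global qualifier on each access.
+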
+; CHECK-LABEL: global_unordered_cluster
+define void @global_unordered_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_unordered_volatile_cluster
+define void @global_unordered_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic_cluster
+define void @global_monotonic_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic_volatile_cluster
+define void @global_monotonic_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_cluster
+define void @global_acq_rel_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_volatile_cluster
+define void @global_acq_rel_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_cluster
+define void @global_seq_cst_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_volatile_cluster
+define void @global_seq_cst_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+;; shared
+
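+; The shared statespace follows the same scheme, with the .shared qualifier;
+; volatile atomics are still strengthened to system scope.
+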
+; CHECK-LABEL: shared_unordered_cluster
+define void @shared_unordered_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile_cluster
+define void @shared_unordered_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_cluster
+define void @shared_monotonic_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile_cluster
+define void @shared_monotonic_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_cluster
+define void @shared_acq_rel_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_volatile_cluster
+define void @shared_acq_rel_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
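+ ; Note: for volatile atomics the scope is widened to system (.sys), so the
+ ; "cluster" syncscope does not appear in the expected PTX.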
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_cluster
+define void @shared_seq_cst_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
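+ ; Note: each seq_cst access below lowers to a fence.sc.cluster followed by
+ ; an acquire load or a release store at cluster scope.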
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_volatile_cluster
+define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
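+ ; Note: volatile seq_cst accesses use the system-scoped fence.sc.sys and
+ ; system-scoped acquire/release operations.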
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+;; local statespace
+
+; CHECK-LABEL: local_unordered_cluster
+define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
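+ ; Note: the local statespace is private to each thread, so ordering and
+ ; syncscope are currently dropped and plain ld.local/st.local are emitted;
+ ; preserving Concurrent Forward Progress here is a TODO of this patch.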
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile_cluster
+define void @local_unordered_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_cluster
+define void @local_monotonic_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile_cluster
+define void @local_monotonic_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_cluster
+define void @local_acq_rel_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_volatile_cluster
+define void @local_acq_rel_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_cluster
+define void @local_seq_cst_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_volatile_cluster
+define void @local_seq_cst_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index aac73f71a6766c..f922fd92fa244e 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -9,10 +9,21 @@
; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+; TODO: generate PTX that preserves Concurrent Forward Progress
+; for atomic operations on the local statespace
+; by emitting atomic or volatile operations.
+
+; TODO: design exposure for atomic operations on vector types.
+
+; TODO: add weak, atomic, volatile, and atomic volatile tests
+; for .const and .param statespaces.
+
+; TODO: optimize .sys.shared into .cta.shared or .cluster.shared.
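+; (As a hypothetical example, ld.relaxed.sys.shared.u32 could be narrowed to
+; ld.relaxed.cta.shared.u32 when the location is only accessed within the CTA.)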
+
; generic statespace
-; CHECK-LABEL: generic_plain
-define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+; CHECK-LABEL: generic_weak
+define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr %a
%a.add = add i8 %a.load, 1
@@ -238,198 +249,198 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr
ret void
}
-; CHECK-LABEL: generic_monotonic
-define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_unordered_sys
+define void @generic_unordered_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
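+ ; Note: sm_60 and older lack the .relaxed/.acquire/.release qualifiers, so
+ ; relaxed atomics fall back to .volatile accesses; sm_70 and newer use
+ ; ld./st.relaxed.sys as checked below.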
; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr %a monotonic, align 1
+ %a.load = load atomic i8, ptr %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr %a monotonic, align 1
+ store atomic i8 %a.add, ptr %a unordered, align 1
; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b monotonic, align 2
+ %b.load = load atomic i16, ptr %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b monotonic, align 2
+ store atomic i16 %b.add, ptr %b unordered, align 2
; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c monotonic, align 4
+ %c.load = load atomic i32, ptr %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c monotonic, align 4
+ store atomic i32 %c.add, ptr %c unordered, align 4
; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d monotonic, align 8
+ %d.load = load atomic i64, ptr %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d monotonic, align 8
+ store atomic i64 %d.add, ptr %d unordered, align 8
; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e monotonic, align 4
+ %e.load = load atomic float, ptr %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e monotonic, align 4
+ store atomic float %e.add, ptr %e unordered, align 4
; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e monotonic, align 8
+ %f.load = load atomic double, ptr %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e monotonic, align 8
+ store atomic double %f.add, ptr %e unordered, align 8
ret void
}
-; CHECK-LABEL: generic_monotonic_volatile
-define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_unordered_volatile_sys
+define void @generic_unordered_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr %a unordered, align 1
; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr %b unordered, align 2
; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr %c unordered, align 4
; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr %d unordered, align 8
; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr %e monotonic, align 4
+ store atomic volatile float %e.add, ptr %e unordered, align 4
; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr %e monotonic, align 8
+ store atomic volatile double %f.add, ptr %e unordered, align 8
ret void
}
-; CHECK-LABEL: generic_unordered
-define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_monotonic_sys
+define void @generic_monotonic_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr %a unordered, align 1
+ %a.load = load atomic i8, ptr %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr %a unordered, align 1
+ store atomic i8 %a.add, ptr %a monotonic, align 1
; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b unordered, align 2
+ %b.load = load atomic i16, ptr %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b unordered, align 2
+ store atomic i16 %b.add, ptr %b monotonic, align 2
; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c unordered, align 4
+ %c.load = load atomic i32, ptr %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c unordered, align 4
+ store atomic i32 %c.add, ptr %c monotonic, align 4
; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d unordered, align 8
+ %d.load = load atomic i64, ptr %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d unordered, align 8
+ store atomic i64 %d.add, ptr %d monotonic, align 8
; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e unordered, align 4
+ store atomic float %e.add, ptr %e monotonic, align 4
; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e unordered, align 8
+ %f.load = load atomic double, ptr %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e unordered, align 8
+ store atomic double %f.add, ptr %e monotonic, align 8
ret void
}
-; CHECK-LABEL: generic_unordered_volatile
-define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_monotonic_volatile_sys
+define void @generic_monotonic_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr %a monotonic, align 1
; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr %b monotonic, align 2
; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr %c monotonic, align 4
; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr %d monotonic, align 8
; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr %e unordered, align 4
+ store atomic volatile float %e.add, ptr %e monotonic, align 4
; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr %e unordered, align 8
+ %f.load = load atomic volatile double, ptr %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr %e unordered, align 8
+ store atomic volatile double %f.add, ptr %e monotonic, align 8
ret void
}
;; global statespace
-; CHECK-LABEL: global_plain
-define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
+; CHECK-LABEL: global_weak
+define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr addrspace(1) %a
%a.add = add i8 %a.load, 1
@@ -630,222 +641,222 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs
ret void
}
-; CHECK-LABEL: global_monotonic
-define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_unordered_sys
+define void @global_unordered_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
+ %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+ store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
+ %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+ store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
+ %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+ store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
+ %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+ store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
+ %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
+ store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
+ %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
+ store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
ret void
}
-; CHECK-LABEL: global_monotonic_volatile
-define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_unordered_volatile_sys
+define void @global_unordered_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
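+ ; Note: on sm_70 and newer, volatile relaxed atomics on the global statespace
+ ; are expected to lower to .mmio.relaxed.sys operations.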
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
+ store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
+ store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
ret void
}
-; CHECK-LABEL: global_unordered
-define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_monotonic_sys
+define void @global_monotonic_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
+ %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
+ store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
+ %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
+ store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
+ %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
+ store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
+ %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
+ store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
+ store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
+ %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
+ store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
ret void
}
-; CHECK-LABEL: global_unordered_volatile
-define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_monotonic_volatile_sys
+define void @global_monotonic_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
+ store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
+ %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
+ store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
ret void
}
;; shared statespace
-; CHECK-LABEL: shared_plain
-define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
+; CHECK-LABEL: shared_weak
+define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr addrspace(3) %a
%a.add = add i8 %a.load, 1
@@ -1046,202 +1057,198 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs
ret void
}
-; CHECK-LABEL: shared_monotonic
-define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared.
-
+; CHECK-LABEL: shared_unordered_sys
+define void @shared_unordered_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
+ %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
+ %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+ store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
+ %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+ store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
+ %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+ store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
+ %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
+ store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
+ %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
+ store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
ret void
}
-; CHECK-LABEL: shared_monotonic_volatile
-define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+; CHECK-LABEL: shared_unordered_volatile_sys
+define void @shared_unordered_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
+ store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
+ store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
ret void
}
-; CHECK-LABEL: shared_unordered
-define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared.
-
+; CHECK-LABEL: shared_monotonic_sys
+define void @shared_monotonic_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
+ %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
+ store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
+ %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
+ store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
+ %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
+ store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
+ %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
+ store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
+ store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
+ %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
+ store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
ret void
}
-; CHECK-LABEL: shared_unordered_volatile
-define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+; CHECK-LABEL: shared_monotonic_volatile_sys
+define void @shared_monotonic_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
+ store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
+ %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
+ store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
ret void
}
;; local statespace
-; CHECK-LABEL: local_plain
-define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+; CHECK-LABEL: local_weak
+define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr addrspace(5) %a
%a.add = add i8 %a.load, 1
@@ -1343,9 +1350,6 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace
; CHECK-LABEL: local_volatile
define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using volatile operations.
-
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load volatile i8, ptr addrspace(5) %a
%a.add = add i8 %a.load, 1
@@ -1445,175 +1449,166 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp
ret void
}
-; CHECK-LABEL: local_monotonic
-define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_unordered_sys
+define void @local_unordered_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
+ %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
+ %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+ store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
+ %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+ store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
+ %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+ store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+ %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
+ store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
+ %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
+ store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
ret void
}
-; CHECK-LABEL: local_monotonic_volatile
-define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by generating atomic or volatile operations
-
+; CHECK-LABEL: local_unordered_volatile_sys
+define void @local_unordered_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
+ store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
+ store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
ret void
}
-; CHECK-LABEL: local_unordered
-define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_monotonic_sys
+define void @local_monotonic_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
+ %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
+ store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
+ %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
+ store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
+ %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
+ store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
+ %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
+ store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
+ store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
+ %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
+ store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
ret void
}
-; CHECK-LABEL: local_unordered_volatile
-define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_monotonic_volatile_sys
+define void @local_monotonic_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
+ store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
+ %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
+ store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
ret void
}
-
-; TODO: add plain,atomic,volatile,atomic volatile tests
-; for .const and .param statespaces
\ No newline at end of file
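For readers skimming the test changes above: they pin down a fixed mapping from LLVM atomic orderings to PTX load/store qualifiers. Both unordered and monotonic lower to .relaxed.sys on sm_70+, degrade to .volatile on sm_60 (which lacks .relaxed/.acquire/.release), and drop the qualifiers entirely in the .local state space. A minimal standalone C++ sketch of that mapping follows; the names are illustrative, not the in-tree API (the real lowering lives in NVPTXISelDAGToDAG):

// Sketch of the ordering/scope suffixes the SM60/SM70 CHECK lines expect.
#include <cstdio>
#include <string>

enum class Ordering { NotAtomic, Relaxed, Acquire, Release, Volatile };
enum class Scope { Thread, Block, Cluster, Device, System };

std::string ldQualifiers(Ordering O, Scope S, bool HasSM70) {
  if (!HasSM70) {
    // sm_60 fallback: atomic loads/stores degrade to .volatile with no
    // scope suffix, exactly as the SM60 lines check.
    return O == Ordering::NotAtomic ? "" : ".volatile";
  }
  std::string Q;
  switch (O) {
  case Ordering::NotAtomic: return "";
  case Ordering::Relaxed:  Q = ".relaxed"; break;
  case Ordering::Acquire:  Q = ".acquire"; break;
  case Ordering::Release:  Q = ".release"; break;
  case Ordering::Volatile: return ".volatile"; // volatile takes no scope
  }
  switch (S) {
  case Scope::Thread:  break;           // thread scope prints nothing
  case Scope::Block:   Q += ".cta";     break;
  case Scope::Cluster: Q += ".cluster"; break;
  case Scope::Device:  Q += ".gpu";     break;
  case Scope::System:  Q += ".sys";     break;
  }
  return Q;
}

int main() {
  // Reproduces the "ld.relaxed.sys.shared.u32" vs. "ld.volatile.shared.u32"
  // pair from shared_monotonic_sys above.
  std::printf("SM70: ld%s.shared.u32\n",
              ldQualifiers(Ordering::Relaxed, Scope::System, true).c_str());
  std::printf("SM60: ld%s.shared.u32\n",
              ldQualifiers(Ordering::Relaxed, Scope::System, false).c_str());
}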
>From 91bebf49372f9d0748caf46b0981a5be9a288523 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Thu, 12 Sep 2024 14:01:07 -0700
Subject: [PATCH 02/15] [NVPTX] NFC: rename sco to scope to match PTX
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 2 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 144 +++++++++---------
2 files changed, 73 insertions(+), 73 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 2a44ce0273ee1b..e8e7d254b7cab9 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -255,7 +255,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
"Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.",
OrderingToString(Ordering)));
}
- } else if (!strcmp(Modifier, "sco")) {
+ } else if (!strcmp(Modifier, "scope")) {
auto S = NVPTX::Scope(Imm);
switch (S) {
case NVPTX::Scope::Thread:
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 963c1de07bab74..510e4b81003119 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2971,39 +2971,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
multiclass LD<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _ari : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _ari_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _asi : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
}
@@ -3019,42 +3019,42 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _ari : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr,
i32imm:$offset),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr,
i32imm:$offset),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _asi : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr,
i32imm:$offset),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
}
@@ -3073,75 +3073,75 @@ let mayStore=1, hasSideEffects=0 in {
multiclass LD_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v4_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
@@ -3156,87 +3156,87 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
imem:$addr),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
Int32Regs:$addr),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
Int64Regs:$addr),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_ari : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_asi : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$sco,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
imem:$addr, i32imm:$offset),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v4_avar : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_asi : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$sco, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "st${sem:sem}${sco:sco}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}"
"$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
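The sco -> scope rename above is purely mechanical, but it helps to see what the ${...} operand modifiers in those asm strings expand to: each one calls back into printLdStCode with the named modifier, and each hook appends at most one dotted qualifier, so the pieces concatenate left to right into the final PTX mnemonic. A hand-assembled C++ illustration of the store format string (the qualifier values chosen here are illustrative):

// "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
// expanded piecewise for a release store to shared memory at CTA scope:
#include <cstdio>
#include <string>

int main() {
  std::string Inst = "st";
  Inst += ".release"; // ${sem:sem}     sem   = Release
  Inst += ".cta";     // ${scope:scope} scope = Block, printed as .cta
  Inst += ".shared";  // ${addsp:addsp} addsp = SHARED
  Inst += "";         // ${Vec:vec}     scalar access prints nothing
  Inst += ".";        // literal '.' in the format string
  Inst += "u";        // ${Sign:sign}   Unsigned prints a bare "u"
  Inst += "32";       // $toWidth       access width in bits
  std::printf("%s \t[%%rd1], %%r1;\n", Inst.c_str());
  // -> st.release.cta.shared.u32   [%rd1], %r1;
}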
>From cc6881daae9c6f9ad46d9ac09fbc33ad2144b340 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Thu, 12 Sep 2024 14:29:40 -0700
Subject: [PATCH 03/15] [NVPTX] NFC: use StringRef inside printLdStCode
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 18 ++++++++++--------
1 file changed, 10 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index e8e7d254b7cab9..8e062ecdd427c8 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -14,6 +14,7 @@
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXUtilities.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -225,11 +226,12 @@ void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
}
void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
- raw_ostream &O, const char *Modifier) {
- if (Modifier) {
+ raw_ostream &O, const char *Modifier_) {
+ if (Modifier_) {
+ llvm::StringRef Modifier(Modifier_);
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int) MO.getImm();
- if (!strcmp(Modifier, "sem")) {
+ if (Modifier == "sem") {
auto Ordering = NVPTX::Ordering(Imm);
switch (Ordering) {
case NVPTX::Ordering::NotAtomic:
@@ -255,7 +257,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
"Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.",
OrderingToString(Ordering)));
}
- } else if (!strcmp(Modifier, "scope")) {
+ } else if (Modifier == "scope") {
auto S = NVPTX::Scope(Imm);
switch (S) {
case NVPTX::Scope::Thread:
@@ -276,7 +278,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
report_fatal_error(formatv(
"NVPTX LdStCode Printer does not support \"{}\" sco modifier.",
ScopeToString(S)));
- } else if (!strcmp(Modifier, "addsp")) {
+ } else if (Modifier == "addsp") {
switch (Imm) {
case NVPTX::PTXLdStInstCode::GLOBAL:
O << ".global";
@@ -298,7 +300,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
default:
llvm_unreachable("Wrong Address Space");
}
- } else if (!strcmp(Modifier, "sign")) {
+ } else if (Modifier == "sign") {
if (Imm == NVPTX::PTXLdStInstCode::Signed)
O << "s";
else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
@@ -309,13 +311,13 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
O << "f";
else
llvm_unreachable("Unknown register type");
- } else if (!strcmp(Modifier, "vec")) {
+ } else if (Modifier == "vec") {
if (Imm == NVPTX::PTXLdStInstCode::V2)
O << ".v2";
else if (Imm == NVPTX::PTXLdStInstCode::V4)
O << ".v4";
} else
- llvm_unreachable("Unknown Modifier");
+ llvm_unreachable(formatv("Unknown Modifier: {}", Modifier).str().c_str());
} else
llvm_unreachable("Empty Modifier");
}
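The StringRef change is likewise mechanical; the point is that strcmp needs a non-null NUL-terminated pointer at every call site, whereas a length-carrying string view turns the comparisons into ordinary == and lets the null check happen once up front. A sketch of the same idiom using the standard-library analogue std::string_view (not the LLVM class, and dispatch is a made-up stand-in for the printer):

#include <cassert>
#include <string_view>

// Modifier dispatch as patch 03 restructures it: compare a once-built
// string view against literals instead of chaining strcmp calls.
static const char *dispatch(const char *M) {
  if (!M)
    return "empty";              // the guard the printer keeps up front
  std::string_view Modifier(M);  // length computed once
  if (Modifier == "sem")
    return "ordering";
  if (Modifier == "scope")
    return "scope";
  return "unknown";
}

int main() {
  assert(std::string_view(dispatch("scope")) == "scope");
  assert(std::string_view(dispatch(nullptr)) == "empty");
}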
>From 6c692bdd7a706ea94c29187ba8206a53eb77d5b1 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Fri, 13 Sep 2024 08:30:18 -0700
Subject: [PATCH 04/15] [NVPTX] NFC: reformat InstrPrinter LdSt
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 184 +++++++++---------
1 file changed, 92 insertions(+), 92 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 8e062ecdd427c8..3db73b2d41b077 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -226,100 +226,100 @@ void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
}
void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
- raw_ostream &O, const char *Modifier_) {
- if (Modifier_) {
- llvm::StringRef Modifier(Modifier_);
- const MCOperand &MO = MI->getOperand(OpNum);
- int Imm = (int) MO.getImm();
- if (Modifier == "sem") {
- auto Ordering = NVPTX::Ordering(Imm);
- switch (Ordering) {
- case NVPTX::Ordering::NotAtomic:
- return;
- case NVPTX::Ordering::Relaxed:
- O << ".relaxed";
- return;
- case NVPTX::Ordering::Acquire:
- O << ".acquire";
- return;
- case NVPTX::Ordering::Release:
- O << ".release";
- return;
- case NVPTX::Ordering::Volatile:
- O << ".volatile";
- return;
- case NVPTX::Ordering::RelaxedMMIO:
- O << ".mmio.relaxed";
- return;
- default:
- report_fatal_error(formatv(
- "NVPTX LdStCode Printer does not support \"{}\" sem modifier. "
- "Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.",
- OrderingToString(Ordering)));
- }
- } else if (Modifier == "scope") {
- auto S = NVPTX::Scope(Imm);
- switch (S) {
- case NVPTX::Scope::Thread:
- return;
- case NVPTX::Scope::System:
- O << ".sys";
- return;
- case NVPTX::Scope::Block:
- O << ".cta";
- return;
- case NVPTX::Scope::Cluster:
- O << ".cluster";
- return;
- case NVPTX::Scope::Device:
- O << ".gpu";
- return;
- }
+ raw_ostream &O, const char *M) {
+ if (!M)
+ llvm_unreachable("Empty Modifier");
+
+ llvm::StringRef Modifier(M);
+ const MCOperand &MO = MI->getOperand(OpNum);
+ int Imm = (int)MO.getImm();
+ if (Modifier == "sem") {
+ auto Ordering = NVPTX::Ordering(Imm);
+ switch (Ordering) {
+ case NVPTX::Ordering::NotAtomic:
+ return;
+ case NVPTX::Ordering::Relaxed:
+ O << ".relaxed";
+ return;
+ case NVPTX::Ordering::Acquire:
+ O << ".acquire";
+ return;
+ case NVPTX::Ordering::Release:
+ O << ".release";
+ return;
+ case NVPTX::Ordering::Volatile:
+ O << ".volatile";
+ return;
+ case NVPTX::Ordering::RelaxedMMIO:
+ O << ".mmio.relaxed";
+ return;
+ default:
report_fatal_error(formatv(
- "NVPTX LdStCode Printer does not support \"{}\" sco modifier.",
- ScopeToString(S)));
- } else if (Modifier == "addsp") {
- switch (Imm) {
- case NVPTX::PTXLdStInstCode::GLOBAL:
- O << ".global";
- return;
- case NVPTX::PTXLdStInstCode::SHARED:
- O << ".shared";
- return;
- case NVPTX::PTXLdStInstCode::LOCAL:
- O << ".local";
- return;
- case NVPTX::PTXLdStInstCode::PARAM:
- O << ".param";
- return;
- case NVPTX::PTXLdStInstCode::CONSTANT:
- O << ".const";
- return;
- case NVPTX::PTXLdStInstCode::GENERIC:
- return;
- default:
- llvm_unreachable("Wrong Address Space");
- }
- } else if (Modifier == "sign") {
- if (Imm == NVPTX::PTXLdStInstCode::Signed)
- O << "s";
- else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
- O << "u";
- else if (Imm == NVPTX::PTXLdStInstCode::Untyped)
- O << "b";
- else if (Imm == NVPTX::PTXLdStInstCode::Float)
- O << "f";
- else
- llvm_unreachable("Unknown register type");
- } else if (Modifier == "vec") {
- if (Imm == NVPTX::PTXLdStInstCode::V2)
- O << ".v2";
- else if (Imm == NVPTX::PTXLdStInstCode::V4)
- O << ".v4";
- } else
- llvm_unreachable(formatv("Unknown Modifier: {}", Modifier).str().c_str());
+ "NVPTX LdStCode Printer does not support \"{}\" sem modifier. "
+ "Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.",
+ OrderingToString(Ordering)));
+ }
+ } else if (Modifier == "scope") {
+ auto S = NVPTX::Scope(Imm);
+ switch (S) {
+ case NVPTX::Scope::Thread:
+ return;
+ case NVPTX::Scope::System:
+ O << ".sys";
+ return;
+ case NVPTX::Scope::Block:
+ O << ".cta";
+ return;
+ case NVPTX::Scope::Cluster:
+ O << ".cluster";
+ return;
+ case NVPTX::Scope::Device:
+ O << ".gpu";
+ return;
+ }
+ report_fatal_error(
+ formatv("NVPTX LdStCode Printer does not support \"{}\" sco modifier.",
+ ScopeToString(S)));
+ } else if (Modifier == "addsp") {
+ switch (Imm) {
+ case NVPTX::PTXLdStInstCode::GLOBAL:
+ O << ".global";
+ return;
+ case NVPTX::PTXLdStInstCode::SHARED:
+ O << ".shared";
+ return;
+ case NVPTX::PTXLdStInstCode::LOCAL:
+ O << ".local";
+ return;
+ case NVPTX::PTXLdStInstCode::PARAM:
+ O << ".param";
+ return;
+ case NVPTX::PTXLdStInstCode::CONSTANT:
+ O << ".const";
+ return;
+ case NVPTX::PTXLdStInstCode::GENERIC:
+ return;
+ default:
+ llvm_unreachable("Wrong Address Space");
+ }
+ } else if (Modifier == "sign") {
+ if (Imm == NVPTX::PTXLdStInstCode::Signed)
+ O << "s";
+ else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
+ O << "u";
+ else if (Imm == NVPTX::PTXLdStInstCode::Untyped)
+ O << "b";
+ else if (Imm == NVPTX::PTXLdStInstCode::Float)
+ O << "f";
+ else
+ llvm_unreachable("Unknown register type");
+ } else if (Modifier == "vec") {
+ if (Imm == NVPTX::PTXLdStInstCode::V2)
+ O << ".v2";
+ else if (Imm == NVPTX::PTXLdStInstCode::V4)
+ O << ".v4";
} else
- llvm_unreachable("Empty Modifier");
+ llvm_unreachable(formatv("Unknown Modifier: {}", Modifier).str().c_str());
}
void NVPTXInstPrinter::printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O,
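Patch 04 above is shape-only: it hoists the null check into a guard clause and makes every handled case return instead of break, so the function can never fall through and print a second qualifier. A minimal C++ shell of that shape, illustrative rather than the in-tree function:

#include <cstdio>
#include <cstdlib>

void printQualifier(const char *M, int Imm) {
  if (!M) {                 // guard clause replaces wrapping it all in if (M)
    std::fprintf(stderr, "Empty Modifier\n");
    std::abort();
  }
  switch (Imm) {
  case 0:
    return;                 // e.g. NotAtomic: print nothing and stop
  case 1:
    std::puts(".relaxed");
    return;                 // return, not break: one qualifier at most
  default:
    std::fprintf(stderr, "unsupported modifier value %d\n", Imm);
    std::abort();
  }
}

int main() { printQualifier("sem", 1); }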
>From 25dbb0771e50858e7a8636a8e7679e3274f2c617 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Fri, 13 Sep 2024 09:24:26 -0700
Subject: [PATCH 05/15] [NVPTX] NFC: replace strcmp with StringRef
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 149 ++++++++++--------
1 file changed, 83 insertions(+), 66 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 3db73b2d41b077..3ba861a68d63e1 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -96,140 +96,141 @@ void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
- const char *Modifier) {
+ const char *M) {
const MCOperand &MO = MI->getOperand(OpNum);
int64_t Imm = MO.getImm();
+ llvm::StringRef Modifier(M);
- if (strcmp(Modifier, "ftz") == 0) {
+ if (Modifier == "ftz") {
// FTZ flag
if (Imm & NVPTX::PTXCvtMode::FTZ_FLAG)
O << ".ftz";
- } else if (strcmp(Modifier, "sat") == 0) {
+ return;
+ } else if (Modifier == "sat") {
// SAT flag
if (Imm & NVPTX::PTXCvtMode::SAT_FLAG)
O << ".sat";
- } else if (strcmp(Modifier, "relu") == 0) {
+ return;
+ } else if (Modifier == "relu") {
// RELU flag
if (Imm & NVPTX::PTXCvtMode::RELU_FLAG)
O << ".relu";
- } else if (strcmp(Modifier, "base") == 0) {
+ return;
+ } else if (Modifier == "base") {
// Default operand
switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) {
default:
return;
case NVPTX::PTXCvtMode::NONE:
- break;
+ return;
case NVPTX::PTXCvtMode::RNI:
O << ".rni";
- break;
+ return;
case NVPTX::PTXCvtMode::RZI:
O << ".rzi";
- break;
+ return;
case NVPTX::PTXCvtMode::RMI:
O << ".rmi";
- break;
+ return;
case NVPTX::PTXCvtMode::RPI:
O << ".rpi";
- break;
+ return;
case NVPTX::PTXCvtMode::RN:
O << ".rn";
- break;
+ return;
case NVPTX::PTXCvtMode::RZ:
O << ".rz";
- break;
+ return;
case NVPTX::PTXCvtMode::RM:
O << ".rm";
- break;
+ return;
case NVPTX::PTXCvtMode::RP:
O << ".rp";
- break;
+ return;
case NVPTX::PTXCvtMode::RNA:
O << ".rna";
- break;
+ return;
}
- } else {
- llvm_unreachable("Invalid conversion modifier");
}
+ llvm_unreachable("Invalid conversion modifier");
}
void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
- const char *Modifier) {
+ const char *M) {
const MCOperand &MO = MI->getOperand(OpNum);
int64_t Imm = MO.getImm();
+ llvm::StringRef Modifier(M);
- if (strcmp(Modifier, "ftz") == 0) {
+ if (Modifier == "ftz") {
// FTZ flag
if (Imm & NVPTX::PTXCmpMode::FTZ_FLAG)
O << ".ftz";
- } else if (strcmp(Modifier, "base") == 0) {
+ return;
+ } else if (Modifier == "base") {
switch (Imm & NVPTX::PTXCmpMode::BASE_MASK) {
default:
return;
case NVPTX::PTXCmpMode::EQ:
O << ".eq";
- break;
+ return;
case NVPTX::PTXCmpMode::NE:
O << ".ne";
- break;
+ return;
case NVPTX::PTXCmpMode::LT:
O << ".lt";
- break;
+ return;
case NVPTX::PTXCmpMode::LE:
O << ".le";
- break;
+ return;
case NVPTX::PTXCmpMode::GT:
O << ".gt";
- break;
+ return;
case NVPTX::PTXCmpMode::GE:
O << ".ge";
- break;
+ return;
case NVPTX::PTXCmpMode::LO:
O << ".lo";
- break;
+ return;
case NVPTX::PTXCmpMode::LS:
O << ".ls";
- break;
+ return;
case NVPTX::PTXCmpMode::HI:
O << ".hi";
- break;
+ return;
case NVPTX::PTXCmpMode::HS:
O << ".hs";
- break;
+ return;
case NVPTX::PTXCmpMode::EQU:
O << ".equ";
- break;
+ return;
case NVPTX::PTXCmpMode::NEU:
O << ".neu";
- break;
+ return;
case NVPTX::PTXCmpMode::LTU:
O << ".ltu";
- break;
+ return;
case NVPTX::PTXCmpMode::LEU:
O << ".leu";
- break;
+ return;
case NVPTX::PTXCmpMode::GTU:
O << ".gtu";
- break;
+ return;
case NVPTX::PTXCmpMode::GEU:
O << ".geu";
- break;
+ return;
case NVPTX::PTXCmpMode::NUM:
O << ".num";
- break;
+ return;
case NVPTX::PTXCmpMode::NotANumber:
O << ".nan";
- break;
+ return;
}
- } else {
- llvm_unreachable("Empty Modifier");
}
+ llvm_unreachable("Empty Modifier");
}
void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
raw_ostream &O, const char *M) {
- if (!M)
- llvm_unreachable("Empty Modifier");
-
llvm::StringRef Modifier(M);
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int)MO.getImm();
@@ -303,44 +304,60 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
llvm_unreachable("Wrong Address Space");
}
} else if (Modifier == "sign") {
- if (Imm == NVPTX::PTXLdStInstCode::Signed)
+ switch (Imm) {
+ case NVPTX::PTXLdStInstCode::Signed:
O << "s";
- else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
+ return;
+ case NVPTX::PTXLdStInstCode::Unsigned:
O << "u";
- else if (Imm == NVPTX::PTXLdStInstCode::Untyped)
+ return;
+ case NVPTX::PTXLdStInstCode::Untyped:
O << "b";
- else if (Imm == NVPTX::PTXLdStInstCode::Float)
+ return;
+ case NVPTX::PTXLdStInstCode::Float:
O << "f";
- else
+ return;
+ default:
llvm_unreachable("Unknown register type");
+ }
} else if (Modifier == "vec") {
- if (Imm == NVPTX::PTXLdStInstCode::V2)
+ switch (Imm) {
+ case NVPTX::PTXLdStInstCode::V2:
O << ".v2";
- else if (Imm == NVPTX::PTXLdStInstCode::V4)
+ return;
+ case NVPTX::PTXLdStInstCode::V4:
O << ".v4";
- } else
- llvm_unreachable(formatv("Unknown Modifier: {}", Modifier).str().c_str());
+ return;
+ }
+ // TODO: evaluate whether cases not covered by this switch are bugs
+ return;
+ }
+ llvm_unreachable(formatv("Unknown Modifier: {}", Modifier).str().c_str());
}
void NVPTXInstPrinter::printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O,
- const char *Modifier) {
+ const char *M) {
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int)MO.getImm();
- if (Modifier == nullptr || strcmp(Modifier, "version") == 0) {
+ llvm::StringRef Modifier(M);
+ if (Modifier.empty() || Modifier == "version") {
O << Imm; // Just print out PTX version
- } else if (strcmp(Modifier, "aligned") == 0) {
+ return;
+ } else if (Modifier == "aligned") {
// PTX63 requires '.aligned' in the name of the instruction.
if (Imm >= 63)
O << ".aligned";
- } else
- llvm_unreachable("Unknown Modifier");
+ return;
+ }
+ llvm_unreachable("Unknown Modifier");
}
void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
- raw_ostream &O, const char *Modifier) {
+ raw_ostream &O, const char *M) {
printOperand(MI, OpNum, O);
+ llvm::StringRef Modifier(M);
- if (Modifier && !strcmp(Modifier, "add")) {
+ if (Modifier == "add") {
O << ", ";
printOperand(MI, OpNum + 1, O);
} else {
@@ -370,24 +387,24 @@ void NVPTXInstPrinter::printPrmtMode(const MCInst *MI, int OpNum,
default:
return;
case NVPTX::PTXPrmtMode::NONE:
- break;
+ return;
case NVPTX::PTXPrmtMode::F4E:
O << ".f4e";
- break;
+ return;
case NVPTX::PTXPrmtMode::B4E:
O << ".b4e";
- break;
+ return;
case NVPTX::PTXPrmtMode::RC8:
O << ".rc8";
- break;
+ return;
case NVPTX::PTXPrmtMode::ECL:
O << ".ecl";
- break;
+ return;
case NVPTX::PTXPrmtMode::ECR:
O << ".ecr";
- break;
+ return;
case NVPTX::PTXPrmtMode::RC16:
O << ".rc16";
- break;
+ return;
}
}
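The payoff of this NFC change, sketched as a standalone contrast (hypothetical helpers, mirroring the printer's const char *M parameter):

    #include "llvm/ADT/StringRef.h"
    #include <cstring>

    bool isFtzOld(const char *M) {
      return std::strcmp(M, "ftz") == 0; // C-string compare, easy to get wrong
    }

    bool isFtzNew(const char *M) {
      llvm::StringRef Modifier(M); // one conversion at function entry
      return Modifier == "ftz";    // value comparison, length-aware
    }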
>From e54284fbb6b7b15b15c2065f6ba912bdb9fbfa16 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Fri, 13 Sep 2024 10:08:33 -0700
Subject: [PATCH 06/15] [NVPTX] Add AddressSpace Utilities and refactor
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 30 ++++++--------
llvm/lib/Target/NVPTX/NVPTX.h | 20 ++++++----
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 40 +++++++++++--------
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 24 +++++++++++
4 files changed, 71 insertions(+), 43 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 3ba861a68d63e1..7d6442a611125f 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -282,27 +282,21 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
formatv("NVPTX LdStCode Printer does not support \"{}\" sco modifier.",
ScopeToString(S)));
} else if (Modifier == "addsp") {
- switch (Imm) {
- case NVPTX::PTXLdStInstCode::GLOBAL:
- O << ".global";
- return;
- case NVPTX::PTXLdStInstCode::SHARED:
- O << ".shared";
- return;
- case NVPTX::PTXLdStInstCode::LOCAL:
- O << ".local";
+ auto A = NVPTX::AddressSpace(Imm);
+ switch (A) {
+ case NVPTX::AddressSpace::Generic:
return;
- case NVPTX::PTXLdStInstCode::PARAM:
- O << ".param";
+ case NVPTX::AddressSpace::Global:
+ case NVPTX::AddressSpace::Const:
+ case NVPTX::AddressSpace::Shared:
+ case NVPTX::AddressSpace::Param:
+ case NVPTX::AddressSpace::Local:
+ O << "." << A;
return;
- case NVPTX::PTXLdStInstCode::CONSTANT:
- O << ".const";
- return;
- case NVPTX::PTXLdStInstCode::GENERIC:
- return;
- default:
- llvm_unreachable("Wrong Address Space");
}
+ report_fatal_error(formatv(
+ "NVPTX LdStCode Printer does not support \"{}\" addsp modifier.",
+ AddressSpaceToString(A)));
} else if (Modifier == "sign") {
switch (Imm) {
case NVPTX::PTXLdStInstCode::Signed:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index b5624f9212ea27..5dbe6673ab8c3c 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -135,15 +135,19 @@ enum Scope : ScopeUnderlyingType {
LASTSCOPE = Device
};
-namespace PTXLdStInstCode {
-enum AddressSpace {
- GENERIC = 0,
- GLOBAL = 1,
- CONSTANT = 2,
- SHARED = 3,
- PARAM = 4,
- LOCAL = 5
+using AddressSpaceUnderlyingType = unsigned int;
+enum AddressSpace : AddressSpaceUnderlyingType {
+ Generic = 0,
+ Global = 1,
+ Shared = 3,
+ Const = 4,
+ Local = 5,
+
+ // NVPTX Backend Private:
+ Param = 101
};
+
+namespace PTXLdStInstCode {
enum FromType {
Unsigned = 0,
Signed,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index f04796fcdd49fe..ad22cb3315dd46 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -704,20 +704,26 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
const Value *Src = N->getMemOperand()->getValue();
if (!Src)
- return NVPTX::PTXLdStInstCode::GENERIC;
+ return NVPTX::AddressSpace::Generic;
if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
switch (PT->getAddressSpace()) {
- case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
- case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
- case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
- case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
- case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
- case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
+ case llvm::ADDRESS_SPACE_LOCAL:
+ return NVPTX::AddressSpace::Local;
+ case llvm::ADDRESS_SPACE_GLOBAL:
+ return NVPTX::AddressSpace::Global;
+ case llvm::ADDRESS_SPACE_SHARED:
+ return NVPTX::AddressSpace::Shared;
+ case llvm::ADDRESS_SPACE_GENERIC:
+ return NVPTX::AddressSpace::Generic;
+ case llvm::ADDRESS_SPACE_PARAM:
+ return NVPTX::AddressSpace::Param;
+ case llvm::ADDRESS_SPACE_CONST:
+ return NVPTX::AddressSpace::Const;
default: break;
}
}
- return NVPTX::PTXLdStInstCode::GENERIC;
+ return NVPTX::AddressSpace::Generic;
}
namespace {
@@ -820,9 +826,9 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
// - the "weak" memory instruction we are currently lowering to, and
// - some other instruction that preserves the side-effect, e.g.,
// a dead dummy volatile load.
- if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
+ if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
+ CodeAddrSpace == NVPTX::AddressSpace::Const ||
+ CodeAddrSpace == NVPTX::AddressSpace::Param) {
return NVPTX::Ordering::NotAtomic;
}
@@ -847,14 +853,14 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
// atomics is undefined if the generic address does not refer to a .global or
// .shared memory location.
bool AddrGenericOrGlobalOrShared =
- (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
+ (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
+ CodeAddrSpace == NVPTX::AddressSpace::Global ||
+ CodeAddrSpace == NVPTX::AddressSpace::Shared);
if (!AddrGenericOrGlobalOrShared)
return NVPTX::Ordering::NotAtomic;
bool UseRelaxedMMIO =
- HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
+ HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
switch (Ordering) {
case AtomicOrdering::NotAtomic:
@@ -975,7 +981,7 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
// TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
// explicitly invariant loads because these are how clang tells us to use ldg
// when the user uses a builtin.
- if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
+ if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::AddressSpace::Global)
return false;
if (N->isInvariant())
@@ -2090,7 +2096,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
// Address Space Setting
unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
- if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
+ if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
report_fatal_error("Cannot store to pointer that points to constant "
"memory space");
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 77b35e0777c95a..938b9b04b7a449 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -137,6 +137,30 @@ inline raw_ostream &operator<<(raw_ostream &O, Scope S) {
return O;
}
+inline std::string AddressSpaceToString(AddressSpace A) {
+ switch (A) {
+ case AddressSpace::Generic:
+ return "generic";
+ case AddressSpace::Global:
+ return "global";
+ case AddressSpace::Const:
+ return "const";
+ case AddressSpace::Shared:
+ return "shared";
+ case AddressSpace::Param:
+ return "param";
+ case AddressSpace::Local:
+ return "local";
+ }
+ report_fatal_error(formatv("Unknown NVPTX::AddressSpace \"{}\".",
+ static_cast<AddressSpaceUnderlyingType>(A)));
+}
+
+inline raw_ostream &operator<<(raw_ostream &O, AddressSpace A) {
+ O << AddressSpaceToString(A);
+ return O;
+}
+
} // namespace NVPTX
} // namespace llvm
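A hypothetical use of the new utilities (NVPTXUtilities.h is private to the backend, so this only compiles inside the NVPTX target itself):

    #include "NVPTXUtilities.h"
    #include "llvm/Support/raw_ostream.h"

    void printAddrSpace(llvm::raw_ostream &OS) {
      auto A = llvm::NVPTX::AddressSpace::Shared;
      OS << "." << A; // operator<< calls AddressSpaceToString: prints ".shared"
    }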
>From a55a2d3d4b798fb81d8129b3c9afbfffc022f048 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Fri, 13 Sep 2024 11:17:45 -0700
Subject: [PATCH 07/15] [NVPTX] use llvm_unreachable for scope map
initialization check
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ad22cb3315dd46..396b8684350fc8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -4187,8 +4187,8 @@ NVPTXScopes::NVPTXScopes(LLVMContext &C) : CTX(&C) {
NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
if (Scopes.empty())
- report_fatal_error("NVPTX Scopes must be initialized before calling "
- "NVPTXScopes::operator[]");
+ llvm_unreachable("NVPTX Scopes must be initialized before calling "
+ "NVPTXScopes::operator[]");
auto S = Scopes.find(ID);
if (S == Scopes.end()) {
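The change reads as: an uninitialized scope map is an internal invariant violation, not something user input can provoke. A standalone rule-of-thumb sketch (names invented for illustration):

    #include "llvm/Support/ErrorHandling.h"

    void diagnose(bool UserTriggerable, bool InternalBug) {
      if (UserTriggerable) // e.g., input a frontend can actually produce
        llvm::report_fatal_error("kept in all build modes");
      if (InternalBug)     // e.g., a precondition the backend guarantees
        llvm_unreachable("assert-like; may be optimized out in release");
    }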
>From f58d62bd62e11ef30f17735673d7bb83dd8192ce Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Fri, 13 Sep 2024 11:24:20 -0700
Subject: [PATCH 08/15] [NVPTX] NFC: add comment for insertMemoryInstructionFence
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 6 +++---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 3 +++
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 396b8684350fc8..458b30ad84b4a0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1059,12 +1059,12 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
}
}
+// Returns Memory Order and Scope of a memory instruction, and
+// inserts any fence before the instruction that's required to
+// implement its memory ordering.
std::pair<NVPTX::Ordering, NVPTX::Scope>
NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
MemSDNode *N) {
- // Some memory instructions - loads, stores, atomics - need an extra fence
- // instruction. Get the memory order of the instruction, and that of its
- // fence, if any.
auto [InstructionOrdering, FenceOrdering] =
getOperationOrderings(N, Subtarget);
auto Scope = getOperationScope(N, InstructionOrdering);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 7eccf9e45314b1..de2609a7a83df6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -116,6 +116,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, LoadSDNode *N);
+ // Returns Memory Order and Scope of a memory instruction, and
+ // inserts any fence before the instruction that's required to
+ // implement its memory ordering.
std::pair<NVPTX::Ordering, NVPTX::Scope>
insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, MemSDNode *N);
NVPTX::Scope getOperationScope(MemSDNode *N, NVPTX::Ordering O) const;
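As a portable analogy for what the new comment describes (conceptual C++, not the backend's code): under a leading-fence lowering, a seq_cst load becomes a fence followed by an acquire load, and it is that leading fence which insertMemoryInstructionFence threads into the chain.

    #include <atomic>

    int seqCstLoad(const std::atomic<int> &X) {
      std::atomic_thread_fence(std::memory_order_seq_cst); // the inserted fence
      return X.load(std::memory_order_acquire);            // the instruction itself
    }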
>From 60763720fcad3ab1112da86a3ea72acf02a9b6c0 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Fri, 13 Sep 2024 11:33:01 -0700
Subject: [PATCH 09/15] [NVPTX] NFC: rename requireClusters
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 4 ++--
llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp | 3 ++-
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 2 +-
3 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 458b30ad84b4a0..45880ff06d0afd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -952,7 +952,7 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
// If scope is cluster, clusters must be supported.
if (S == NVPTX::Scope::Cluster)
- Subtarget->requireClusters("cluster scope");
+ Subtarget->failIfClustersUnsupported("cluster scope");
// If operation is volatile, then its scope is system.
if (N->isVolatile())
@@ -1007,7 +1007,7 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
NVPTXSubtarget const *T) {
if (S == NVPTX::Scope::Cluster)
- T->requireClusters(".cluster scope fence");
+ T->failIfClustersUnsupported(".cluster scope fence");
switch (O) {
case NVPTX::Ordering::Acquire:
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 1fb8e3c6565593..0e6b75e622c6ad 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -72,7 +72,8 @@ bool NVPTXSubtarget::allowFP16Math() const {
return hasFP16Math() && NoF16Math == false;
}
-void NVPTXSubtarget::requireClusters(std::string const &FailureMessage) const {
+void NVPTXSubtarget::failIfClustersUnsupported(
+ std::string const &FailureMessage) const {
if (hasClusters())
return;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 30be7a9906850c..7b032876c87220 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -120,7 +120,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
- void requireClusters(std::string const &FailureMessage) const;
+ void failIfClustersUnsupported(std::string const &FailureMessage) const;
};
} // End llvm namespace
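The new name spells out the check-or-die contract. A minimal sketch of that pattern (the struct and message wording are assumptions, not the patch):

    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/ErrorHandling.h"
    #include <string>

    struct SubtargetSketch {
      bool HasClusters = false;
      void failIfClustersUnsupported(const std::string &FailureMessage) const {
        if (HasClusters)
          return; // clusters supported: the caller may proceed
        llvm::report_fatal_error(llvm::Twine("target does not support ") +
                                 FailureMessage);
      }
    };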
>From 393bdd635ad28ee51b5e845a6b5e82e9bbe52457 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Fri, 13 Sep 2024 11:58:39 -0700
Subject: [PATCH 10/15] [NVPTX] Update Sync Scope ordering
---
llvm/lib/Target/NVPTX/NVPTX.h | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 5dbe6673ab8c3c..f6ab81d3ca0bb2 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -128,11 +128,11 @@ enum Ordering : OrderingUnderlyingType {
using ScopeUnderlyingType = unsigned int;
enum Scope : ScopeUnderlyingType {
Thread = 0,
- System = 1,
- Block = 2,
- Cluster = 3,
- Device = 4,
- LASTSCOPE = Device
+ Block = 1,
+ Cluster = 2,
+ Device = 3,
+ System = 4,
+ LASTSCOPE = System
};
using AddressSpaceUnderlyingType = unsigned int;
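After this reordering the enumerators grow monotonically with scope inclusiveness (Thread < Block < Cluster < Device < System). Assuming that monotonicity is the point of the change, picking the wider of two scopes reduces to an integer max:

    #include <algorithm>

    enum Scope : unsigned { Thread = 0, Block = 1, Cluster = 2,
                            Device = 3, System = 4 };

    // Pick the more inclusive of two scopes by numeric comparison.
    Scope widest(Scope A, Scope B) { return std::max(A, B); }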
>From c702fb1a8772ef40b296cedb2b1302459d352b3b Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Fri, 13 Sep 2024 13:47:49 -0700
Subject: [PATCH 11/15] [NVPTX] NFC: remove make pair and early return
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 45880ff06d0afd..bf562c175cb353 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -955,10 +955,7 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
Subtarget->failIfClustersUnsupported("cluster scope");
// If operation is volatile, then its scope is system.
- if (N->isVolatile())
- S = NVPTX::Scope::System;
-
- return S;
+ return N->isVolatile() ? NVPTX::Scope::System : S;
}
}
@@ -1083,7 +1080,7 @@ NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
formatv("Unexpected fence ordering: \"{}\".",
OrderingToString(NVPTX::Ordering(FenceOrdering))));
}
- return std::make_pair(InstructionOrdering, Scope);
+ return {InstructionOrdering, Scope};
}
bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
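Both simplifications in this patch, in one standalone sketch: list-initializing the declared return type instead of std::make_pair, and folding the mutate-then-return into a conditional expression (types invented for illustration):

    #include <utility>

    enum class Scope { Thread, System };

    std::pair<int, Scope> classify(int Order, Scope S, bool IsVolatile) {
      // Volatile operations are promoted to system scope; otherwise keep S.
      return {Order, IsVolatile ? Scope::System : S};
    }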
>From 4494dc377d08b73bc1b9f8e85c314f695248faef Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Fri, 13 Sep 2024 13:55:12 -0700
Subject: [PATCH 12/15] [NVPTX]: rename Ord to O
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index bf562c175cb353..fc926801b43bb2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -926,9 +926,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
} // namespace
-NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
- NVPTX::Ordering Ord) const {
- switch (Ord) {
+NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N, NVPTX::Ordering O) const {
+ switch (O) {
case NVPTX::Ordering::NotAtomic:
case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
// NVPTX uses Thread scope as the scope of non-atomic operations.
>From ad6465bba1676d1af664b6bfe8d2479c330fbdba Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Fri, 13 Sep 2024 14:51:26 -0700
Subject: [PATCH 13/15] [NVPTX]: fix formatting
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index fc926801b43bb2..379fd8e16ee3b1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -926,7 +926,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
} // namespace
-NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N, NVPTX::Ordering O) const {
+NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
+ NVPTX::Ordering O) const {
switch (O) {
case NVPTX::Ordering::NotAtomic:
case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
>From b760feb36a9ecae702a8bf27607884b099993134 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Wed, 18 Sep 2024 04:01:37 -0700
Subject: [PATCH 14/15] [NVPTX] NFC: improve comment of
insertMemoryInstructionFence
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index de2609a7a83df6..f925fc67fbccb7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -116,9 +116,10 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, LoadSDNode *N);
- // Returns Memory Order and Scope of a memory instruction, and
- // inserts any fence before the instruction that's required to
- // implement its memory ordering.
+ // Returns the Memory Order and Scope that the PTX memory instruction should
+ // use, and inserts the appropriate fence before the memory instruction, if
+ // one is needed to implement the instruction's memory order. Any required
+ // fences after the instruction must be handled elsewhere.
std::pair<NVPTX::Ordering, NVPTX::Scope>
insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, MemSDNode *N);
NVPTX::Scope getOperationScope(MemSDNode *N, NVPTX::Ordering O) const;
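A companion to the earlier load analogy, again conceptual C++ rather than backend code: the store side of the same leading-fence scheme, where the fence also lands before the instruction, so nothing needs to trail it here.

    #include <atomic>

    void seqCstStore(std::atomic<int> &X, int V) {
      std::atomic_thread_fence(std::memory_order_seq_cst); // inserted fence
      X.store(V, std::memory_order_release);               // the instruction itself
    }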
>From 629a0561c1e1d5b975d6c3e647ecea4018dd29e1 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Thu, 19 Sep 2024 07:33:12 -0700
Subject: [PATCH 15/15] [NVPTX]: add a few llvm_unreachable
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 379fd8e16ee3b1..19e626868359fd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -957,6 +957,7 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
// If operation is volatile, then its scope is system.
return N->isVolatile() ? NVPTX::Scope::System : S;
}
+ llvm_unreachable("unhandled ordering");
}
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
@@ -1054,6 +1055,7 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
OrderingToString(O), ScopeToString(S)));
}
+ llvm_unreachable("unhandled ordering");
}
// Returns Memory Order and Scope of a memory instruction, and
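The pattern patch 15 applies, standalone (illustrative enum): cover every enumerator in the switch, then mark the fall-through path unreachable so compilers that cannot prove exhaustiveness do not warn about a missing return.

    #include "llvm/Support/ErrorHandling.h"

    enum class Ordering { Relaxed, Acquire, Release };

    const char *name(Ordering O) {
      switch (O) {
      case Ordering::Relaxed: return "relaxed";
      case Ordering::Acquire: return "acquire";
      case Ordering::Release: return "release";
      }
      llvm_unreachable("unhandled ordering"); // every case above returns
    }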