[llvm] 78ae2de - [NVPTX] Load/Store/Fence syncscope support (#106101)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 23 10:18:05 PDT 2024
Author: gonzalobg
Date: 2024-09-23T10:18:00-07:00
New Revision: 78ae2de4c692bea03d03e4c149b350543d220490
URL: https://github.com/llvm/llvm-project/commit/78ae2de4c692bea03d03e4c149b350543d220490
DIFF: https://github.com/llvm/llvm-project/commit/78ae2de4c692bea03d03e4c149b350543d220490.diff
LOG: [NVPTX] Load/Store/Fence syncscope support (#106101)
Adds "initial" support for `syncscope` to the NVPTX backend
`load`/`store`/`fence` instructions.
Atomic Read-Modify-Write operations intentionally not supported as part
of this initial PR.
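For reference, a minimal IR sketch of what this enables (illustrative only; the
function and value names are made up and are not taken from the new tests):

  define void @scoped_ops(ptr %a, ptr %b) {
    %v = load atomic i32, ptr %a syncscope("block") acquire, align 4
    store atomic i32 %v, ptr %b syncscope("device") release, align 4
    fence syncscope("cluster") acq_rel   ; cluster scope requires sm_90/ptx78
    fence seq_cst                        ; empty scope ("") is system scope
    ret void
  }

The syncscope names recognized by the backend are "singlethread", "" (system),
"block", "cluster", and "device"; see NVPTXScopes below.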
Added:
llvm/test/CodeGen/NVPTX/fence-sm-90.ll
llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
Modified:
llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
llvm/lib/Target/NVPTX/NVPTX.h
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
llvm/lib/Target/NVPTX/NVPTXSubtarget.h
llvm/lib/Target/NVPTX/NVPTXUtilities.h
llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
llvm/test/CodeGen/NVPTX/fence.ll
llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
llvm/test/CodeGen/NVPTX/load-store.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 5b568b0487b45a..7d6442a611125f 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -14,6 +14,7 @@
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXUtilities.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -95,228 +96,262 @@ void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
- const char *Modifier) {
+ const char *M) {
const MCOperand &MO = MI->getOperand(OpNum);
int64_t Imm = MO.getImm();
+ llvm::StringRef Modifier(M);
- if (strcmp(Modifier, "ftz") == 0) {
+ if (Modifier == "ftz") {
// FTZ flag
if (Imm & NVPTX::PTXCvtMode::FTZ_FLAG)
O << ".ftz";
- } else if (strcmp(Modifier, "sat") == 0) {
+ return;
+ } else if (Modifier == "sat") {
// SAT flag
if (Imm & NVPTX::PTXCvtMode::SAT_FLAG)
O << ".sat";
- } else if (strcmp(Modifier, "relu") == 0) {
+ return;
+ } else if (Modifier == "relu") {
// RELU flag
if (Imm & NVPTX::PTXCvtMode::RELU_FLAG)
O << ".relu";
- } else if (strcmp(Modifier, "base") == 0) {
+ return;
+ } else if (Modifier == "base") {
// Default operand
switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) {
default:
return;
case NVPTX::PTXCvtMode::NONE:
- break;
+ return;
case NVPTX::PTXCvtMode::RNI:
O << ".rni";
- break;
+ return;
case NVPTX::PTXCvtMode::RZI:
O << ".rzi";
- break;
+ return;
case NVPTX::PTXCvtMode::RMI:
O << ".rmi";
- break;
+ return;
case NVPTX::PTXCvtMode::RPI:
O << ".rpi";
- break;
+ return;
case NVPTX::PTXCvtMode::RN:
O << ".rn";
- break;
+ return;
case NVPTX::PTXCvtMode::RZ:
O << ".rz";
- break;
+ return;
case NVPTX::PTXCvtMode::RM:
O << ".rm";
- break;
+ return;
case NVPTX::PTXCvtMode::RP:
O << ".rp";
- break;
+ return;
case NVPTX::PTXCvtMode::RNA:
O << ".rna";
- break;
+ return;
}
- } else {
- llvm_unreachable("Invalid conversion modifier");
}
+ llvm_unreachable("Invalid conversion modifier");
}
void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
- const char *Modifier) {
+ const char *M) {
const MCOperand &MO = MI->getOperand(OpNum);
int64_t Imm = MO.getImm();
+ llvm::StringRef Modifier(M);
- if (strcmp(Modifier, "ftz") == 0) {
+ if (Modifier == "ftz") {
// FTZ flag
if (Imm & NVPTX::PTXCmpMode::FTZ_FLAG)
O << ".ftz";
- } else if (strcmp(Modifier, "base") == 0) {
+ return;
+ } else if (Modifier == "base") {
switch (Imm & NVPTX::PTXCmpMode::BASE_MASK) {
default:
return;
case NVPTX::PTXCmpMode::EQ:
O << ".eq";
- break;
+ return;
case NVPTX::PTXCmpMode::NE:
O << ".ne";
- break;
+ return;
case NVPTX::PTXCmpMode::LT:
O << ".lt";
- break;
+ return;
case NVPTX::PTXCmpMode::LE:
O << ".le";
- break;
+ return;
case NVPTX::PTXCmpMode::GT:
O << ".gt";
- break;
+ return;
case NVPTX::PTXCmpMode::GE:
O << ".ge";
- break;
+ return;
case NVPTX::PTXCmpMode::LO:
O << ".lo";
- break;
+ return;
case NVPTX::PTXCmpMode::LS:
O << ".ls";
- break;
+ return;
case NVPTX::PTXCmpMode::HI:
O << ".hi";
- break;
+ return;
case NVPTX::PTXCmpMode::HS:
O << ".hs";
- break;
+ return;
case NVPTX::PTXCmpMode::EQU:
O << ".equ";
- break;
+ return;
case NVPTX::PTXCmpMode::NEU:
O << ".neu";
- break;
+ return;
case NVPTX::PTXCmpMode::LTU:
O << ".ltu";
- break;
+ return;
case NVPTX::PTXCmpMode::LEU:
O << ".leu";
- break;
+ return;
case NVPTX::PTXCmpMode::GTU:
O << ".gtu";
- break;
+ return;
case NVPTX::PTXCmpMode::GEU:
O << ".geu";
- break;
+ return;
case NVPTX::PTXCmpMode::NUM:
O << ".num";
- break;
+ return;
case NVPTX::PTXCmpMode::NotANumber:
O << ".nan";
- break;
+ return;
}
- } else {
- llvm_unreachable("Empty Modifier");
}
+ llvm_unreachable("Empty Modifier");
}
void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
- raw_ostream &O, const char *Modifier) {
- if (Modifier) {
- const MCOperand &MO = MI->getOperand(OpNum);
- int Imm = (int) MO.getImm();
- if (!strcmp(Modifier, "sem")) {
- auto Ordering = NVPTX::Ordering(Imm);
- switch (Ordering) {
- case NVPTX::Ordering::NotAtomic:
- break;
- case NVPTX::Ordering::Volatile:
- O << ".volatile";
- break;
- case NVPTX::Ordering::Relaxed:
- O << ".relaxed.sys";
- break;
- case NVPTX::Ordering::Acquire:
- O << ".acquire.sys";
- break;
- case NVPTX::Ordering::Release:
- O << ".release.sys";
- break;
- case NVPTX::Ordering::RelaxedMMIO:
- O << ".mmio.relaxed.sys";
- break;
- default:
- report_fatal_error(formatv(
- "NVPTX LdStCode Printer does not support \"{}\" sem modifier.",
- OrderingToCString(Ordering)));
- }
- } else if (!strcmp(Modifier, "addsp")) {
- switch (Imm) {
- case NVPTX::PTXLdStInstCode::GLOBAL:
- O << ".global";
- break;
- case NVPTX::PTXLdStInstCode::SHARED:
- O << ".shared";
- break;
- case NVPTX::PTXLdStInstCode::LOCAL:
- O << ".local";
- break;
- case NVPTX::PTXLdStInstCode::PARAM:
- O << ".param";
- break;
- case NVPTX::PTXLdStInstCode::CONSTANT:
- O << ".const";
- break;
- case NVPTX::PTXLdStInstCode::GENERIC:
- break;
- default:
- llvm_unreachable("Wrong Address Space");
- }
- } else if (!strcmp(Modifier, "sign")) {
- if (Imm == NVPTX::PTXLdStInstCode::Signed)
- O << "s";
- else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
- O << "u";
- else if (Imm == NVPTX::PTXLdStInstCode::Untyped)
- O << "b";
- else if (Imm == NVPTX::PTXLdStInstCode::Float)
- O << "f";
- else
- llvm_unreachable("Unknown register type");
- } else if (!strcmp(Modifier, "vec")) {
- if (Imm == NVPTX::PTXLdStInstCode::V2)
- O << ".v2";
- else if (Imm == NVPTX::PTXLdStInstCode::V4)
- O << ".v4";
- } else
- llvm_unreachable("Unknown Modifier");
- } else
- llvm_unreachable("Empty Modifier");
+ raw_ostream &O, const char *M) {
+ llvm::StringRef Modifier(M);
+ const MCOperand &MO = MI->getOperand(OpNum);
+ int Imm = (int)MO.getImm();
+ if (Modifier == "sem") {
+ auto Ordering = NVPTX::Ordering(Imm);
+ switch (Ordering) {
+ case NVPTX::Ordering::NotAtomic:
+ return;
+ case NVPTX::Ordering::Relaxed:
+ O << ".relaxed";
+ return;
+ case NVPTX::Ordering::Acquire:
+ O << ".acquire";
+ return;
+ case NVPTX::Ordering::Release:
+ O << ".release";
+ return;
+ case NVPTX::Ordering::Volatile:
+ O << ".volatile";
+ return;
+ case NVPTX::Ordering::RelaxedMMIO:
+ O << ".mmio.relaxed";
+ return;
+ default:
+ report_fatal_error(formatv(
+ "NVPTX LdStCode Printer does not support \"{}\" sem modifier. "
+ "Loads/Stores cannot be AcquireRelease or SequentiallyConsistent.",
+ OrderingToString(Ordering)));
+ }
+ } else if (Modifier == "scope") {
+ auto S = NVPTX::Scope(Imm);
+ switch (S) {
+ case NVPTX::Scope::Thread:
+ return;
+ case NVPTX::Scope::System:
+ O << ".sys";
+ return;
+ case NVPTX::Scope::Block:
+ O << ".cta";
+ return;
+ case NVPTX::Scope::Cluster:
+ O << ".cluster";
+ return;
+ case NVPTX::Scope::Device:
+ O << ".gpu";
+ return;
+ }
+ report_fatal_error(
+ formatv("NVPTX LdStCode Printer does not support \"{}\" sco modifier.",
+ ScopeToString(S)));
+ } else if (Modifier == "addsp") {
+ auto A = NVPTX::AddressSpace(Imm);
+ switch (A) {
+ case NVPTX::AddressSpace::Generic:
+ return;
+ case NVPTX::AddressSpace::Global:
+ case NVPTX::AddressSpace::Const:
+ case NVPTX::AddressSpace::Shared:
+ case NVPTX::AddressSpace::Param:
+ case NVPTX::AddressSpace::Local:
+ O << "." << A;
+ return;
+ }
+ report_fatal_error(formatv(
+ "NVPTX LdStCode Printer does not support \"{}\" addsp modifier.",
+ AddressSpaceToString(A)));
+ } else if (Modifier == "sign") {
+ switch (Imm) {
+ case NVPTX::PTXLdStInstCode::Signed:
+ O << "s";
+ return;
+ case NVPTX::PTXLdStInstCode::Unsigned:
+ O << "u";
+ return;
+ case NVPTX::PTXLdStInstCode::Untyped:
+ O << "b";
+ return;
+ case NVPTX::PTXLdStInstCode::Float:
+ O << "f";
+ return;
+ default:
+ llvm_unreachable("Unknown register type");
+ }
+ } else if (Modifier == "vec") {
+ switch (Imm) {
+ case NVPTX::PTXLdStInstCode::V2:
+ O << ".v2";
+ return;
+ case NVPTX::PTXLdStInstCode::V4:
+ O << ".v4";
+ return;
+ }
+ // TODO: evaluate whether cases not covered by this switch are bugs
+ return;
+ }
+ llvm_unreachable(formatv("Unknown Modifier: {}", Modifier).str().c_str());
}
void NVPTXInstPrinter::printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O,
- const char *Modifier) {
+ const char *M) {
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int)MO.getImm();
- if (Modifier == nullptr || strcmp(Modifier, "version") == 0) {
+ llvm::StringRef Modifier(M);
+ if (Modifier.empty() || Modifier == "version") {
O << Imm; // Just print out PTX version
- } else if (strcmp(Modifier, "aligned") == 0) {
+ return;
+ } else if (Modifier == "aligned") {
// PTX63 requires '.aligned' in the name of the instruction.
if (Imm >= 63)
O << ".aligned";
- } else
- llvm_unreachable("Unknown Modifier");
+ return;
+ }
+ llvm_unreachable("Unknown Modifier");
}
void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
- raw_ostream &O, const char *Modifier) {
+ raw_ostream &O, const char *M) {
printOperand(MI, OpNum, O);
+ llvm::StringRef Modifier(M);
- if (Modifier && !strcmp(Modifier, "add")) {
+ if (Modifier == "add") {
O << ", ";
printOperand(MI, OpNum + 1, O);
} else {
@@ -346,24 +381,24 @@ void NVPTXInstPrinter::printPrmtMode(const MCInst *MI, int OpNum,
default:
return;
case NVPTX::PTXPrmtMode::NONE:
- break;
+ return;
case NVPTX::PTXPrmtMode::F4E:
O << ".f4e";
- break;
+ return;
case NVPTX::PTXPrmtMode::B4E:
O << ".b4e";
- break;
+ return;
case NVPTX::PTXPrmtMode::RC8:
O << ".rc8";
- break;
+ return;
case NVPTX::PTXPrmtMode::ECL:
O << ".ecl";
- break;
+ return;
case NVPTX::PTXPrmtMode::ECR:
O << ".ecr";
- break;
+ return;
case NVPTX::PTXPrmtMode::RC16:
O << ".rc16";
- break;
+ return;
}
}
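To illustrate the new "scope" modifier handled by the printer above (examples
are illustrative, not copied from the tests), ordering and scope compose into
PTX qualifiers such as:

  ld.acquire.cta.global.u32  %r1, [%rd1];  // Block scope prints ".cta"
  st.release.gpu.shared.u32  [%rd2], %r1;  // Device scope prints ".gpu"
  ld.relaxed.sys.global.u32  %r2, [%rd3];  // System scope prints ".sys"

Thread scope prints no qualifier, and Cluster scope prints ".cluster".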
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index f6f6acb9e13c90..f6ab81d3ca0bb2 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -117,23 +117,37 @@ enum Ordering : OrderingUnderlyingType {
// Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire"
Acquire = (OrderingUnderlyingType)AtomicOrdering::Acquire,
Release = (OrderingUnderlyingType)AtomicOrdering::Release,
- // AcquireRelease = 6, // TODO
+ AcquireRelease = (OrderingUnderlyingType)AtomicOrdering::AcquireRelease,
SequentiallyConsistent =
(OrderingUnderlyingType)AtomicOrdering::SequentiallyConsistent,
Volatile = SequentiallyConsistent + 1,
RelaxedMMIO = Volatile + 1,
- LAST = RelaxedMMIO
+ LASTORDERING = RelaxedMMIO
};
-namespace PTXLdStInstCode {
-enum AddressSpace {
- GENERIC = 0,
- GLOBAL = 1,
- CONSTANT = 2,
- SHARED = 3,
- PARAM = 4,
- LOCAL = 5
+using ScopeUnderlyingType = unsigned int;
+enum Scope : ScopeUnderlyingType {
+ Thread = 0,
+ Block = 1,
+ Cluster = 2,
+ Device = 3,
+ System = 4,
+ LASTSCOPE = System
+};
+
+using AddressSpaceUnderlyingType = unsigned int;
+enum AddressSpace : AddressSpaceUnderlyingType {
+ Generic = 0,
+ Global = 1,
+ Shared = 3,
+ Const = 4,
+ Local = 5,
+
+ // NVPTX Backend Private:
+ Param = 101
};
+
+namespace PTXLdStInstCode {
enum FromType {
Unsigned = 0,
Signed,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 4f0bc1a2044642..56c96ea943b89d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -59,6 +59,7 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
+ Scopes = NVPTXScopes(MF.getFunction().getContext());
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -106,6 +107,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryStore(N))
return;
break;
+ case ISD::ATOMIC_FENCE:
+ if (tryFence(N))
+ return;
+ break;
case ISD::EXTRACT_VECTOR_ELT:
if (tryEXTRACT_VECTOR_ELEMENT(N))
return;
@@ -699,20 +704,26 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
const Value *Src = N->getMemOperand()->getValue();
if (!Src)
- return NVPTX::PTXLdStInstCode::GENERIC;
+ return NVPTX::AddressSpace::Generic;
if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
switch (PT->getAddressSpace()) {
- case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
- case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
- case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
- case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
- case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
- case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
+ case llvm::ADDRESS_SPACE_LOCAL:
+ return NVPTX::AddressSpace::Local;
+ case llvm::ADDRESS_SPACE_GLOBAL:
+ return NVPTX::AddressSpace::Global;
+ case llvm::ADDRESS_SPACE_SHARED:
+ return NVPTX::AddressSpace::Shared;
+ case llvm::ADDRESS_SPACE_GENERIC:
+ return NVPTX::AddressSpace::Generic;
+ case llvm::ADDRESS_SPACE_PARAM:
+ return NVPTX::AddressSpace::Param;
+ case llvm::ADDRESS_SPACE_CONST:
+ return NVPTX::AddressSpace::Const;
default: break;
}
}
- return NVPTX::PTXLdStInstCode::GENERIC;
+ return NVPTX::AddressSpace::Generic;
}
namespace {
@@ -815,9 +826,9 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
// - the "weak" memory instruction we are currently lowering to, and
// - some other instruction that preserves the side-effect, e.g.,
// a dead dummy volatile load.
- if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
+ if (CodeAddrSpace == NVPTX::AddressSpace::Local ||
+ CodeAddrSpace == NVPTX::AddressSpace::Const ||
+ CodeAddrSpace == NVPTX::AddressSpace::Param) {
return NVPTX::Ordering::NotAtomic;
}
@@ -842,14 +853,14 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
// atomics is undefined if the generic address does not refer to a .global or
// .shared memory location.
bool AddrGenericOrGlobalOrShared =
- (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
+ (CodeAddrSpace == NVPTX::AddressSpace::Generic ||
+ CodeAddrSpace == NVPTX::AddressSpace::Global ||
+ CodeAddrSpace == NVPTX::AddressSpace::Shared);
if (!AddrGenericOrGlobalOrShared)
return NVPTX::Ordering::NotAtomic;
bool UseRelaxedMMIO =
- HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
+ HasRelaxedMMIO && CodeAddrSpace == NVPTX::AddressSpace::Global;
switch (Ordering) {
case AtomicOrdering::NotAtomic:
@@ -915,6 +926,40 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
} // namespace
+NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
+ NVPTX::Ordering O) const {
+ switch (O) {
+ case NVPTX::Ordering::NotAtomic:
+ case NVPTX::Ordering::Volatile: // Non-atomic volatile operations
+ // NVPTX uses Thread scope as the scope of non-atomic operations.
+ return NVPTX::Scope::Thread;
+ case NVPTX::Ordering::RelaxedMMIO:
+ // RelaxedMMIO operations are always system scope.
+ // If a RelaxedMMIO order was generated from an atomic volatile operation
+ // with a smaller thread scope, we bump it here to system scope.
+ return NVPTX::Scope::System;
+ case NVPTX::Ordering::Relaxed:
+ case NVPTX::Ordering::Acquire:
+ case NVPTX::Ordering::Release:
+ case NVPTX::Ordering::AcquireRelease:
+ case NVPTX::Ordering::SequentiallyConsistent:
+ auto S = Scopes[N->getSyncScopeID()];
+
+ // Atomic operations must have a scope greater than thread.
+ if (S == NVPTX::Scope::Thread)
+ report_fatal_error(
+ formatv("Atomics need scope > \"{}\".", ScopeToString(S)));
+
+ // If scope is cluster, clusters must be supported.
+ if (S == NVPTX::Scope::Cluster)
+ Subtarget->failIfClustersUnsupported("cluster scope");
+
+ // If operation is volatile, then its scope is system.
+ return N->isVolatile() ? NVPTX::Scope::System : S;
+ }
+ llvm_unreachable("unhandled ordering");
+}
+
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
unsigned CodeAddrSpace, MachineFunction *F) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
@@ -934,7 +979,7 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
// TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
// explicitly invariant loads because these are how clang tells us to use ldg
// when the user uses a builtin.
- if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
+ if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::AddressSpace::Global)
return false;
if (N->isInvariant())
@@ -957,33 +1002,87 @@ static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
});
}
-NVPTX::Ordering NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL,
- SDValue &Chain,
- MemSDNode *N) {
- // Some memory instructions - loads, stores, atomics - need an extra fence
- // instruction. Get the memory order of the instruction, and that of its
- // fence, if any.
+static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
+ NVPTXSubtarget const *T) {
+ if (S == NVPTX::Scope::Cluster)
+ T->failIfClustersUnsupported(".cluster scope fence");
+
+ switch (O) {
+ case NVPTX::Ordering::Acquire:
+ case NVPTX::Ordering::Release:
+ case NVPTX::Ordering::AcquireRelease: {
+ switch (S) {
+ case NVPTX::Scope::System:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_sys
+ : NVPTX::INT_MEMBAR_SYS;
+ case NVPTX::Scope::Block:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_cta
+ : NVPTX::INT_MEMBAR_CTA;
+ case NVPTX::Scope::Cluster:
+ return NVPTX::atomic_thread_fence_acq_rel_cluster;
+ case NVPTX::Scope::Device:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acq_rel_gpu
+ : NVPTX::INT_MEMBAR_GL;
+ case NVPTX::Scope::Thread:
+ report_fatal_error(
+ formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
+ ScopeToString(S)));
+ }
+ }
+ case NVPTX::Ordering::SequentiallyConsistent: {
+ switch (S) {
+ case NVPTX::Scope::System:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_sys
+ : NVPTX::INT_MEMBAR_SYS;
+ case NVPTX::Scope::Block:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_cta
+ : NVPTX::INT_MEMBAR_CTA;
+ case NVPTX::Scope::Cluster:
+ return NVPTX::atomic_thread_fence_seq_cst_cluster;
+ case NVPTX::Scope::Device:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_seq_cst_gpu
+ : NVPTX::INT_MEMBAR_GL;
+ case NVPTX::Scope::Thread:
+ report_fatal_error(formatv("Unsupported scope \"{}\" for seq_cst fence.",
+ ScopeToString(S)));
+ }
+ }
+ case NVPTX::Ordering::NotAtomic:
+ case NVPTX::Ordering::Relaxed:
+ case NVPTX::Ordering::Volatile:
+ case NVPTX::Ordering::RelaxedMMIO:
+ report_fatal_error(
+ formatv("Unsupported \"{}\" ordering and \"{}\" scope for fence.",
+ OrderingToString(O), ScopeToString(S)));
+ }
+ llvm_unreachable("unhandled ordering");
+}
+
+// Returns Memory Order and Scope of a memory instruction, and
+// inserts any fence before the instruction that's required to
+// implement its memory ordering.
+std::pair<NVPTX::Ordering, NVPTX::Scope>
+NVPTXDAGToDAGISel::insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
+ MemSDNode *N) {
auto [InstructionOrdering, FenceOrdering] =
getOperationOrderings(N, Subtarget);
+ auto Scope = getOperationScope(N, InstructionOrdering);
// If a fence is required before the operation, insert it:
switch (NVPTX::Ordering(FenceOrdering)) {
case NVPTX::Ordering::NotAtomic:
break;
case NVPTX::Ordering::SequentiallyConsistent: {
- unsigned Op = Subtarget->hasMemoryOrdering()
- ? NVPTX::atomic_thread_fence_seq_cst_sys
- : NVPTX::INT_MEMBAR_SYS;
+ auto Op = getFenceOp(FenceOrdering, Scope, Subtarget);
Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
break;
}
default:
report_fatal_error(
formatv("Unexpected fence ordering: \"{}\".",
- OrderingToCString(NVPTX::Ordering(FenceOrdering))));
+ OrderingToString(NVPTX::Ordering(FenceOrdering))));
}
-
- return InstructionOrdering;
+ return {InstructionOrdering, Scope};
}
bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
@@ -1154,7 +1253,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
- auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, LD);
+ auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, LD);
// Type Setting: fromType + fromTypeWidth
//
@@ -1189,7 +1288,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
std::optional<unsigned> Opcode;
MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
- SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL),
+ SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL)});
@@ -1266,7 +1365,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
- auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
+ auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1319,7 +1418,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
std::optional<unsigned> Opcode;
SDNode *LD;
- SmallVector<SDValue, 12> Ops({getI32Imm(InstructionOrdering, DL),
+ SmallVector<SDValue, 12> Ops({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL), getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL)});
@@ -1895,7 +1994,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
SDLoc DL(N);
SDValue Chain = ST->getChain();
- auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, ST);
+ auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1923,10 +2022,10 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
MVT::SimpleValueType SourceVT =
Value.getNode()->getSimpleValueType(0).SimpleTy;
- SmallVector<SDValue, 12> Ops({Value, getI32Imm(InstructionOrdering, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(ToType, DL),
- getI32Imm(ToTypeWidth, DL)});
+ SmallVector<SDValue, 12> Ops(
+ {Value, getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
+ getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
+ getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
if (SelectDirectAddr(BasePtr, Addr)) {
Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
@@ -1996,7 +2095,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
// Address Space Setting
unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
- if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
+ if (CodeAddrSpace == NVPTX::AddressSpace::Const) {
report_fatal_error("Cannot store to pointer that points to constant "
"memory space");
}
@@ -2005,7 +2104,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
- auto InstructionOrdering = insertMemoryInstructionFence(DL, Chain, MemSD);
+ auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, MemSD);
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
@@ -2044,9 +2143,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
ToTypeWidth = 32;
}
- Ops.append({getI32Imm(InstructionOrdering, DL), getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(ToType, DL),
- getI32Imm(ToTypeWidth, DL)});
+ Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
+ getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
+ getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL)});
if (SelectDirectAddr(N2, Addr)) {
switch (N->getOpcode()) {
@@ -4064,3 +4163,41 @@ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
}
}
}
+
+bool NVPTXDAGToDAGISel::tryFence(SDNode *N) {
+ SDLoc DL(N);
+ assert(N->getOpcode() == ISD::ATOMIC_FENCE);
+ unsigned int FenceOp =
+ getFenceOp(NVPTX::Ordering(N->getConstantOperandVal(1)),
+ Scopes[N->getConstantOperandVal(2)], Subtarget);
+ SDValue Chain = N->getOperand(0);
+ SDNode *FenceNode = CurDAG->getMachineNode(FenceOp, DL, MVT::Other, Chain);
+ ReplaceNode(N, FenceNode);
+ return true;
+}
+
+NVPTXScopes::NVPTXScopes(LLVMContext &C) {
+ Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread;
+ Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System;
+ Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block;
+ Scopes[C.getOrInsertSyncScopeID("cluster")] = NVPTX::Scope::Cluster;
+ Scopes[C.getOrInsertSyncScopeID("device")] = NVPTX::Scope::Device;
+}
+
+NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
+ if (Scopes.empty())
+ llvm_unreachable("NVPTX Scopes must be initialized before calling "
+ "NVPTXScopes::operator[]");
+
+ auto S = Scopes.find(ID);
+ if (S == Scopes.end()) {
+ // TODO:
+ // - Add API to LLVMContext to get the name of a single scope.
+ // - Use that API here to print an error containing the name
+ // of this Unknown ID.
+ report_fatal_error(formatv("Could not find scope ID={}.", int(ID)));
+ }
+ return S->second;
+}
+
+bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
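As a rough summary of the fence lowering in getFenceOp above (illustrative;
see the fence.ll and fence-sm-90.ll tests below for the checked output):

  fence syncscope("device") seq_cst  -->  fence.sc.gpu          (sm_70/ptx60+), membar.gl  otherwise
  fence syncscope("block") acquire   -->  fence.acq_rel.cta     (sm_70/ptx60+), membar.cta otherwise
  fence syncscope("cluster") acq_rel -->  fence.acq_rel.cluster (sm_90/ptx78 only; no membar fallback)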
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index eac4056599511c..c128c082c29837 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -18,13 +18,25 @@
#include "NVPTXISelLowering.h"
#include "NVPTXRegisterInfo.h"
#include "NVPTXTargetMachine.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Compiler.h"
namespace llvm {
+struct NVPTXScopes {
+ NVPTXScopes() = default;
+ NVPTXScopes(LLVMContext &C);
+ NVPTX::Scope operator[](SyncScope::ID ID) const;
+ bool empty() const;
+
+private:
+ SmallMapVector<SyncScope::ID, NVPTX::Scope, 8> Scopes{};
+};
+
class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
const NVPTXTargetMachine &TM;
@@ -38,6 +50,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool allowUnsafeFPMath() const;
bool doRsqrtOpt() const;
+ NVPTXScopes Scopes{};
+
public:
NVPTXDAGToDAGISel() = delete;
@@ -66,6 +80,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryLoadParam(SDNode *N);
bool tryStoreRetval(SDNode *N);
bool tryStoreParam(SDNode *N);
+ bool tryFence(SDNode *N);
void SelectAddrSpaceCast(SDNode *N);
bool tryTextureIntrinsic(SDNode *N);
bool trySurfaceIntrinsic(SDNode *N);
@@ -100,8 +115,13 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
static unsigned GetConvertOpcode(MVT DestTy, MVT SrcTy, LoadSDNode *N);
- NVPTX::Ordering insertMemoryInstructionFence(SDLoc DL, SDValue &Chain,
- MemSDNode *N);
+ // Returns the Memory Order and Scope that the PTX memory instruction should
+ // use, and inserts appropriate fence instruction before the memory
+ // instruction, if needed to implement the instruction's memory order. Required
+ // fences after the instruction need to be handled elsewhere.
+ std::pair<NVPTX::Ordering, NVPTX::Scope>
+ insertMemoryInstructionFence(SDLoc DL, SDValue &Chain, MemSDNode *N);
+ NVPTX::Scope getOperationScope(MemSDNode *N, NVPTX::Ordering O) const;
};
class NVPTXDAGToDAGISelLegacy : public SelectionDAGISelLegacy {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b7e210805db904..510e4b81003119 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2971,39 +2971,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
multiclass LD<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _ari : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _ari_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _asi : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
}
@@ -3019,39 +3019,42 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _ari : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr,
+ i32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr,
+ i32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _asi : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
- LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
+ LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, imem:$addr,
+ i32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
}
@@ -3070,75 +3073,75 @@ let mayStore=1, hasSideEffects=0 in {
multiclass LD_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v4_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
@@ -3153,84 +3156,87 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ imem:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int32Regs:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int64Regs:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_ari : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
- i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int32Regs:$addr, i32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
- i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ Int64Regs:$addr, i32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_asi : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
- LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
- i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
+ LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
+ imem:$addr, i32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v4_avar : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_asi : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
- i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
+ LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}"
"$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
@@ -4003,17 +4009,23 @@ def atomic_thread_fence_acq_rel_sys :
NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
Requires<[hasPTX<60>, hasSM<70>]>;
-def : Pat<(atomic_fence (i64 4), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acquire(4) sys(1)
- Requires<[hasPTX<60>, hasSM<70>]>;
-def : Pat<(atomic_fence (i64 5), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // release(5) sys(1)
- Requires<[hasPTX<60>, hasSM<70>]>;
-def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acq_rel(6) sys(1)
- Requires<[hasPTX<60>, hasSM<70>]>;
-def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys)>, // seq_cst(7) sys(1)
- Requires<[hasPTX<60>, hasSM<70>]>;
-
-// If PTX<60 or SM<70, we fall back to MEMBAR:
-def : Pat<(atomic_fence (i64 4), (i64 1)), (INT_MEMBAR_SYS)>; // acquire(4) sys(1)
-def : Pat<(atomic_fence (i64 5), (i64 1)), (INT_MEMBAR_SYS)>; // release(5) sys(1)
-def : Pat<(atomic_fence (i64 6), (i64 1)), (INT_MEMBAR_SYS)>; // acq_rel(6) sys(1)
-def : Pat<(atomic_fence (i64 7), (i64 1)), (INT_MEMBAR_SYS)>; // seq_cst(7) sys(1)
+def atomic_thread_fence_seq_cst_gpu :
+ NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+def atomic_thread_fence_acq_rel_gpu :
+ NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+
+def atomic_thread_fence_seq_cst_cluster :
+ NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+def atomic_thread_fence_acq_rel_cluster :
+ NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>,
+ Requires<[hasPTX<78>, hasSM<90>]>;
+
+def atomic_thread_fence_seq_cst_cta :
+ NVPTXInst<(outs), (ins), "fence.sc.cta;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+def atomic_thread_fence_acq_rel_cta :
+ NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
\ No newline at end of file
diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index f2515f971595bf..f66504b09cb63f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -11,12 +11,11 @@
// to work reliably, inlining of all function call must be performed.
//
//===----------------------------------------------------------------------===//
-
+#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
-#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -1820,8 +1819,8 @@ findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
return false;
}
- assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!");
- StringRef Sym = TexHandleDef.getOperand(6).getSymbolName();
+ assert(TexHandleDef.getOperand(7).isSymbol() && "Load is not a symbol!");
+ StringRef Sym = TexHandleDef.getOperand(7).getSymbolName();
std::string ParamBaseName = std::string(MF.getName());
ParamBaseName += "_param_";
assert(Sym.starts_with(ParamBaseName) && "Invalid symbol reference");
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 420065585b3849..0e6b75e622c6ad 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -12,6 +12,8 @@
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
using namespace llvm;
@@ -69,3 +71,14 @@ bool NVPTXSubtarget::hasImageHandles() const {
bool NVPTXSubtarget::allowFP16Math() const {
return hasFP16Math() && NoF16Math == false;
}
+
+void NVPTXSubtarget::failIfClustersUnsupported(
+ std::string const &FailureMessage) const {
+ if (hasClusters())
+ return;
+
+ report_fatal_error(formatv(
+ "NVPTX SM architecture \"{}\" and PTX version \"{}\" do not support {}. "
+ "Requires SM >= 90 and PTX >= 78.",
+ getFullSmVersion(), PTXVersion, FailureMessage));
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 457f10f1d64a26..8b9059bd60cbd4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -78,6 +78,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasAtomBitwise64() const { return SmVersion >= 32; }
bool hasAtomMinMax64() const { return SmVersion >= 32; }
bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
+ bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
bool hasLDG() const { return SmVersion >= 32; }
bool hasHWROT32() const { return SmVersion >= 32; }
bool hasImageHandles() const;
@@ -119,6 +120,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
+
+ void failIfClustersUnsupported(std::string const &FailureMessage) const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index eebd91fefe4f03..938b9b04b7a449 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -20,6 +20,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"
+#include "llvm/Support/FormatVariadic.h"
#include <cstdarg>
#include <set>
#include <string>
@@ -86,7 +87,7 @@ bool Isv2x16VT(EVT VT);
namespace NVPTX {
-inline std::string OrderingToCString(Ordering Order) {
+inline std::string OrderingToString(Ordering Order) {
switch (Order) {
case Ordering::NotAtomic:
return "NotAtomic";
@@ -96,7 +97,8 @@ inline std::string OrderingToCString(Ordering Order) {
return "Acquire";
case Ordering::Release:
return "Release";
- // case Ordering::AcquireRelease: return "AcquireRelease";
+ case Ordering::AcquireRelease:
+ return "AcquireRelease";
case Ordering::SequentiallyConsistent:
return "SequentiallyConsistent";
case Ordering::Volatile:
@@ -104,11 +106,58 @@ inline std::string OrderingToCString(Ordering Order) {
case Ordering::RelaxedMMIO:
return "RelaxedMMIO";
}
- report_fatal_error("unknown ordering");
+ report_fatal_error(formatv("Unknown NVPTX::Ordering \"{}\".",
+ static_cast<OrderingUnderlyingType>(Order)));
}
inline raw_ostream &operator<<(raw_ostream &O, Ordering Order) {
- O << OrderingToCString(Order);
+ O << OrderingToString(Order);
+ return O;
+}
+
+inline std::string ScopeToString(Scope S) {
+ switch (S) {
+ case Scope::Thread:
+ return "Thread";
+ case Scope::System:
+ return "System";
+ case Scope::Block:
+ return "Block";
+ case Scope::Cluster:
+ return "Cluster";
+ case Scope::Device:
+ return "Device";
+ }
+ report_fatal_error(formatv("Unknown NVPTX::Scope \"{}\".",
+ static_cast<ScopeUnderlyingType>(S)));
+}
+
+inline raw_ostream &operator<<(raw_ostream &O, Scope S) {
+ O << ScopeToString(S);
+ return O;
+}
+
+inline std::string AddressSpaceToString(AddressSpace A) {
+ switch (A) {
+ case AddressSpace::Generic:
+ return "generic";
+ case AddressSpace::Global:
+ return "global";
+ case AddressSpace::Const:
+ return "const";
+ case AddressSpace::Shared:
+ return "shared";
+ case AddressSpace::Param:
+ return "param";
+ case AddressSpace::Local:
+ return "local";
+ }
+ report_fatal_error(formatv("Unknown NVPTX::AddressSpace \"{}\".",
+ static_cast<AddressSpaceUnderlyingType>(A)));
+}
+
+inline raw_ostream &operator<<(raw_ostream &O, AddressSpace A) {
+ O << AddressSpaceToString(A);
return O;
}
diff --git a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
index 58e2e644b000fe..a40b4d85773b29 100644
--- a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
+++ b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
@@ -40,9 +40,9 @@ registers:
- { id: 7, class: float32regs }
body: |
bb.0.entry:
- %0 = LD_f32_avar 0, 4, 1, 2, 32, &test_param_0
+ %0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test_param_0
%1 = CVT_f64_f32 %0, 0
- %2 = LD_i32_avar 0, 4, 1, 0, 32, &test_param_1
+ %2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test_param_1
; CHECK: %3:float64regs = FADD_rnf64ri %1, double 3.250000e+00
%3 = FADD_rnf64ri %1, double 3.250000e+00
%4 = CVT_f32_f64 %3, 5
@@ -66,9 +66,9 @@ registers:
- { id: 7, class: float32regs }
body: |
bb.0.entry:
- %0 = LD_f32_avar 0, 4, 1, 2, 32, &test2_param_0
+ %0 = LD_f32_avar 0, 0, 4, 1, 2, 32, &test2_param_0
%1 = CVT_f64_f32 %0, 0
- %2 = LD_i32_avar 0, 4, 1, 0, 32, &test2_param_1
+ %2 = LD_i32_avar 0, 0, 4, 1, 0, 32, &test2_param_1
; CHECK: %3:float64regs = FADD_rnf64ri %1, double 0x7FF8000000000000
%3 = FADD_rnf64ri %1, double 0x7FF8000000000000
%4 = CVT_f32_f64 %3, 5
diff --git a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
new file mode 100644
index 00000000000000..82eb5fb71677b6
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+
+; CHECK-LABEL: fence_sc_cluster
+define void @fence_sc_cluster() local_unnamed_addr {
+ ; CHECK: fence.sc.cluster
+ fence syncscope("cluster") seq_cst
+ ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_cluster
+define void @fence_acq_rel_cluster() local_unnamed_addr {
+ ; CHECK: fence.acq_rel.cluster
+ fence syncscope("cluster") acq_rel
+ ret void
+}
+
+; CHECK-LABEL: fence_release_cluster
+define void @fence_release_cluster() local_unnamed_addr {
+ ; CHECK: fence.acq_rel.cluster
+ fence syncscope("cluster") release
+ ret void
+}
+
+; CHECK-LABEL: fence_acquire_cluster
+define void @fence_acquire_cluster() local_unnamed_addr {
+ ; CHECK: fence.acq_rel.cluster
+ fence syncscope("cluster") acquire
+ ret void
+}
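
The RUN lines of this new test gate cluster-scope fences on sm_90 with PTX ISA 7.8 (+ptx78). A hedged sketch of such a gate in C++; the names below are assumptions for illustration, not the actual NVPTXSubtarget interface:

    // Hypothetical predicate, assuming the sm_90/+ptx78 requirement from the RUN lines.
    inline bool clusterScopeSupported(unsigned SmVersion, unsigned PTXVersion) {
      return SmVersion >= 90 && PTXVersion >= 78;
    }
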
diff --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll
index d3aace95e96650..626685f82f32ca 100644
--- a/llvm/test/CodeGen/NVPTX/fence.ll
+++ b/llvm/test/CodeGen/NVPTX/fence.ll
@@ -3,6 +3,8 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70
; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %}
+; TODO: implement and test thread scope.
+
; CHECK-LABEL: fence_sc_sys
define void @fence_sc_sys() local_unnamed_addr {
; SM60: membar.sys
@@ -16,21 +18,85 @@ define void @fence_acq_rel_sys() local_unnamed_addr {
; SM60: membar.sys
; SM70: fence.acq_rel.sys
fence acq_rel
- ret void
+ ret void
}
; CHECK-LABEL: fence_release_sys
define void @fence_release_sys() local_unnamed_addr {
; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
+ ; SM70: fence.acq_rel.sys
fence release
- ret void
+ ret void
}
; CHECK-LABEL: fence_acquire_sys
define void @fence_acquire_sys() local_unnamed_addr {
; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
+ ; SM70: fence.acq_rel.sys
fence acquire
- ret void
+ ret void
+}
+
+; CHECK-LABEL: fence_sc_gpu
+define void @fence_sc_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.sc.gpu
+ fence syncscope("device") seq_cst
+ ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_gpu
+define void @fence_acq_rel_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.acq_rel.gpu
+ fence syncscope("device") acq_rel
+ ret void
+}
+
+; CHECK-LABEL: fence_release_gpu
+define void @fence_release_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.acq_rel.gpu
+ fence syncscope("device") release
+ ret void
+}
+
+; CHECK-LABEL: fence_acquire_gpu
+define void @fence_acquire_gpu() local_unnamed_addr {
+ ; SM60: membar.gl
+ ; SM70: fence.acq_rel.gpu
+ fence syncscope("device") acquire
+ ret void
+}
+
+; CHECK-LABEL: fence_sc_cta
+define void @fence_sc_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.sc.cta
+ fence syncscope("block") seq_cst
+ ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_cta
+define void @fence_acq_rel_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.acq_rel.cta
+ fence syncscope("block") acq_rel
+ ret void
+}
+
+; CHECK-LABEL: fence_release_cta
+define void @fence_release_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.acq_rel.cta
+ fence syncscope("block") release
+ ret void
+}
+
+; CHECK-LABEL: fence_acquire_cta
+define void @fence_acquire_cta() local_unnamed_addr {
+ ; SM60: membar.cta
+ ; SM70: fence.acq_rel.cta
+ fence syncscope("block") acquire
+ ret void
}
\ No newline at end of file
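
Taken together, the fence tests encode the scope-to-qualifier mapping the CHECK lines expect on sm_70 and newer: the default (system) scope lowers to .sys, syncscope("device") to .gpu, syncscope("block") to .cta, and syncscope("cluster") to .cluster (sm_90 only, per fence-sm-90.ll); acquire and release fences are both checked as fence.acq_rel at the requested scope. A small self-contained C++ sketch of that mapping (hypothetical names, not the patch's API):

    #include <cstdio>
    #include <string>

    enum class SyncScope { System, Device, Block, Cluster };

    // PTX scope qualifier implied by the CHECK lines above.
    std::string ptxScopeQualifier(SyncScope S) {
      switch (S) {
      case SyncScope::System:  return ".sys";     // default LLVM syncscope
      case SyncScope::Device:  return ".gpu";     // syncscope("device")
      case SyncScope::Block:   return ".cta";     // syncscope("block")
      case SyncScope::Cluster: return ".cluster"; // syncscope("cluster"), sm_90+
      }
      return ".sys";
    }

    int main() {
      // An acq_rel fence at device scope is checked as "fence.acq_rel.gpu" above.
      std::printf("fence.acq_rel%s\n", ptxScopeQualifier(SyncScope::Device).c_str());
    }
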
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
index 9cea33d12027f2..4b200eacb0cf4a 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -1,10 +1,367 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s
; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
+; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;"
+; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile release": generates "st.release.sys;"
+; but should generate "fence.acq_rel.sys; st.mmio.relaxed.sys;"
+
+; TODO: fix "atomic load volatile seq_cst": generates "fence.sc.sys; ld.acquire.sys;"
+; but should generate "fence.sc.sys; ld.mmio.relaxed.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile seq_cst": generates "fence.sc.sys; st.release.sys;"
+; but should generate "fence.sc.sys; st.mmio.relaxed.sys;"
+
+; TODO: add i1, <8 x i8>, and <6 x i8> vector tests.
+
+; TODO: add test for vectors that exceed 128-bit length
+; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+; TODO: generate PTX that preserves Concurrent Forward Progress
+; for atomic operations to local statespace
+; by generating atomic or volatile operations.
+
+; TODO: design exposure for atomic operations on vector types.
+
+; TODO: implement and test thread scope.
+
+; TODO: add weak,atomic,volatile,atomic volatile tests
+; for .const and .param statespaces.
+
+; TODO: optimize .sys.shared into .cta.shared or .cluster.shared .
+
;; generic statespace
-; CHECK-LABEL: generic_acq_rel
-define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_unordered_gpu
+define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_unordered_volatile_gpu
+define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_unordered_cta
+define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_unordered_volatile_cta
+define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_gpu
+define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_volatile_gpu
+define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_cta
+define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_volatile_cta
+define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_sys
+define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -31,7 +388,7 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr %e release, align 4
@@ -44,8 +401,8 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam
ret void
}
-; CHECK-LABEL: generic_acq_rel_volatile
-define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_acq_rel_volatile_sys
+define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -72,7 +429,7 @@ define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) lo
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr %e release, align 4
@@ -85,8 +442,172 @@ define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) lo
ret void
}
-; CHECK-LABEL: generic_sc
-define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_acq_rel_gpu
+define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_volatile_gpu
+define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_cta
+define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_volatile_cta
+define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_sc_sys
+define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr %a seq_cst, align 1
@@ -122,7 +643,7 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr %e seq_cst, align 4
@@ -138,8 +659,8 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad
ret void
}
-; CHECK-LABEL: generic_sc_volatile
-define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_sc_volatile_sys
+define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr %a seq_cst, align 1
@@ -175,7 +696,7 @@ define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
; CHECK: fence.sc.sys
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr %e seq_cst, align 4
@@ -191,393 +712,2338 @@ define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u
ret void
}
-;; global statespace
-
-; CHECK-LABEL: global_acq_rel
-define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1
+; CHECK-LABEL: generic_sc_gpu
+define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a release, align 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
- ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b release, align 2
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
- ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c release, align 4
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
- ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d release, align 8
-
- ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e release, align 4
-
- ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e release, align 8
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("device") seq_cst, align 8
ret void
}
-; CHECK-LABEL: global_acq_rel_volatile
-define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1
+; CHECK-LABEL: generic_sc_volatile_gpu
+define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("device") seq_cst, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("device") seq_cst, align 1
- ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("device") seq_cst, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("device") seq_cst, align 2
- ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("device") seq_cst, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("device") seq_cst, align 4
- ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("device") seq_cst, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("device") seq_cst, align 8
- ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("device") seq_cst, align 4
- ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("device") seq_cst, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_sc_cta
+define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("block") seq_cst, align 8
ret void
}
-; CHECK-LABEL: global_seq_cst
-define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: generic_sc_volatile_cta
+define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("block") seq_cst, align 1
%a.add = add i8 %a.load, 1
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("block") seq_cst, align 1
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("block") seq_cst, align 2
%b.add = add i16 %b.load, 1
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("block") seq_cst, align 2
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("block") seq_cst, align 4
%c.add = add i32 %c.load, 1
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("block") seq_cst, align 4
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("block") seq_cst, align 8
%d.add = add i64 %d.load, 1
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("block") seq_cst, align 8
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("block") seq_cst, align 4
; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("block") seq_cst, align 8
%f.add = fadd double %f.load, 1.
; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("block") seq_cst, align 8
ret void
}
-; CHECK-LABEL: global_seq_cst_volatile
-define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1
+;; global statespace
+
+; CHECK-LABEL: global_unordered_gpu
+define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
+ ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2
+ ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
+ ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4
+ ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
+ ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8
+ ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
+ ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4
+ ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8
+ ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") unordered, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8
+ ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
ret void
}
-;; shared statespace
+; CHECK-LABEL: global_unordered_volatile_gpu
+define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1
-; CHECK-LABEL: shared_acq_rel
-define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_unordered_cta
+define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a release, align 1
+ ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
- ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2
+ ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b release, align 2
+ ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
- ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4
+ ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c release, align 4
+ ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
- ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8
+ ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d release, align 8
+ ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
- ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e release, align 4
+ ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
- ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8
+ ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e release, align 8
+ ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
ret void
}
-; CHECK-LABEL: shared_acq_rel_volatile
-define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1
+; CHECK-LABEL: global_unordered_volatile_cta
+define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") unordered, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1
- ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") unordered, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2
- ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") unordered, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4
- ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") unordered, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8
- ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4
- ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") unordered, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8
ret void
}
-; CHECK-LABEL: shared_seq_cst
-define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1
+; CHECK-LABEL: global_monotonic_gpu
+define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
+ ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2
+ ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
+ ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4
+ ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
+ ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8
+ ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
+ ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4
+ ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8
+ ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
ret void
}
-; CHECK-LABEL: shared_seq_cst_volatile
-define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1
+; CHECK-LABEL: global_monotonic_volatile_gpu
+define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic_cta
+define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic_volatile_cta
+define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_sys
+define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_volatile_sys
+define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_gpu
+define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_volatile_gpu
+define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_cta
+define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_volatile_cta
+define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_sys
+define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_volatile_sys
+define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_gpu
+define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_volatile_gpu
+define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_cta
+define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_volatile_cta
+define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
+;; shared statespace
+
+; CHECK-LABEL: shared_unordered_gpu
+define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile_gpu
+define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_unordered_cta
+define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile_cta
+define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_gpu
+define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile_gpu
+define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_cta
+define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile_cta
+define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_sys
+define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_volatile_sys
+define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_gpu
+define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_volatile_gpu
+define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_cta
+define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_volatile_cta
+define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_sys
+define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_volatile_sys
+define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_gpu
+define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.gpu
+ ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.gpu
+ ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_volatile_gpu
+define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_cta
+define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.cta
+ ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cta
+ ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_volatile_cta
+define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
+
+;; local statespace
+
+; CHECK-LABEL: local_unordered_gpu
+define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile_gpu
+define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_unordered_cta
+define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile_cta
+define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_gpu
+define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile_gpu
+define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_cta
+define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile_cta
+define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4
- ; CHECK: fence.sc.sys
- ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") monotonic, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: fence.sc.sys
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8
ret void
}
-;; local statespace
-
-; CHECK-LABEL: local_acq_rel
-define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_acq_rel_sys
+define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -604,7 +3070,7 @@ define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr addrspace(5) %e release, align 4
@@ -617,11 +3083,8 @@ define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
ret void
}
-; CHECK-LABEL: local_acq_rel_volatile
-define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_acq_rel_volatile_sys
+define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
%a.add = add i8 %a.load, 1
@@ -648,7 +3111,7 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
@@ -661,11 +3124,172 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
ret void
}
-; CHECK-LABEL: local_seq_cst
-define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
+; CHECK-LABEL: local_acq_rel_gpu
+define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_volatile_gpu
+define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_cta
+define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_volatile_cta
+define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8
+
+ ret void
+}
+; CHECK-LABEL: local_seq_cst_sys
+define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
@@ -692,7 +3316,7 @@ define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4
@@ -705,11 +3329,8 @@ define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspa
ret void
}
-; CHECK-LABEL: local_seq_cst_volatile
-define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_seq_cst_volatile_sys
+define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
@@ -736,7 +3357,7 @@ define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4
@@ -746,10 +3367,169 @@ define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8
- ; TODO: LLVM IR Verifier does not support atomics on vector types.
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_gpu
+define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_volatile_gpu
+define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_cta
+define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
ret void
}
-; TODO: add plain,atomic,volatile,atomic volatile tests
-; for .const and .param statespaces
\ No newline at end of file
+; CHECK-LABEL: local_seq_cst_volatile_cta
+define void @local_seq_cst_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8
+
+ ret void
+}
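The scope mapping these tests exercise is, roughly: no syncscope lowers to the .sys qualifier, syncscope("block") to .cta, syncscope("device") to .gpu, and, from sm_90 onward, syncscope("cluster") to .cluster; accesses to the local statespace drop the ordering and scope qualifiers altogether, since local memory is private to a thread (see the TODOs about Concurrent Forward Progress). A minimal sketch, not part of this commit, with a made-up function name and the .cta/.gpu forms inferred from the tests' naming convention:

; Sketch: relaxed atomic i32 loads from generic memory at each scope.
define i32 @syncscope_scope_sketch(ptr %p) {
  ; default scope, expected:                ld.relaxed.sys.u32
  %v.sys = load atomic i32, ptr %p monotonic, align 4
  ; syncscope("block"), expected:           ld.relaxed.cta.u32
  %v.cta = load atomic i32, ptr %p syncscope("block") monotonic, align 4
  ; syncscope("device"), expected:          ld.relaxed.gpu.u32
  %v.gpu = load atomic i32, ptr %p syncscope("device") monotonic, align 4
  ; syncscope("cluster"), sm_90+, expected: ld.relaxed.cluster.u32
  %v.cluster = load atomic i32, ptr %p syncscope("cluster") monotonic, align 4
  %r0 = add i32 %v.sys, %v.cta
  %r1 = add i32 %r0, %v.gpu
  %r2 = add i32 %r1, %v.cluster
  ret i32 %r2
}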
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
new file mode 100644
index 00000000000000..645170da51a011
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll
@@ -0,0 +1,1423 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
+
+; TODO: fix "atomic load volatile acquire": generates "ld.acquire.sys;"
+; but should generate "ld.mmio.relaxed.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile release": generates "st.release.sys;"
+; but should generate "fence.acq_rel.sys; st.mmio.relaxed.sys;"
+
+; TODO: fix "atomic load volatile seq_cst": generates "fence.sc.sys; ld.acquire.sys;"
+; but should generate "fence.sc.sys; ld.relaxed.mmio.sys; fence.acq_rel.sys;"
+; TODO: fix "atomic store volatile seq_cst": generates "fence.sc.sys; st.release.sys;"
+; but should generate "fence.sc.sys; st.relaxed.mmio.sys;"
+
+; TODO: add i1, <8 x i8>, and <6 x i8> vector tests.
+
+; TODO: add test for vectors that exceed 128-bit length
+; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+; TODO: generate PTX that preserves Concurrent Forward Progress
+; for atomic operations to local statespace
+; by generating atomic or volatile operations.
+
+; TODO: design exposure for atomic operations on vector types.
+
+; TODO: implement and test thread scope.
+
+; TODO: add weak,atomic,volatile,atomic volatile tests
+; for .const and .param statespaces.
+
+; TODO: optimize .shared.sys into .shared.cta or .shared.cluster.
+
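; Illustrative sketch, not part of this commit: the volatile handling the
; TODOs above describe, reduced to a single acquire load on a hypothetical
; function. The backend currently emits the plain acquire form; the TODOs
; ask for an MMIO-relaxed access followed by an acq_rel fence instead.
define i32 @volatile_acquire_sketch(ptr %p) {
  ; currently generated:       ld.acquire.sys
  ; intended (per TODO above): ld.mmio.relaxed.sys; fence.acq_rel.sys;
  %v = load atomic volatile i32, ptr %p acquire, align 4
  ret i32 %v
}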
+;; generic statespace
+
+; CHECK-LABEL: generic_unordered_cluster
+define void @generic_unordered_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_unordered_volatile_cluster
+define void @generic_unordered_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_cluster
+define void @generic_monotonic_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_volatile_cluster
+define void @generic_monotonic_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_cluster
+define void @generic_acq_rel_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel_volatile_cluster
+define void @generic_acq_rel_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_sc_cluster
+define void @generic_sc_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_sc_volatile_cluster
+define void @generic_sc_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+;; global statespace
+
+; CHECK-LABEL: global_unordered_cluster
+define void @global_unordered_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_unordered_volatile_cluster
+define void @global_unordered_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic_cluster
+define void @global_monotonic_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic_volatile_cluster
+define void @global_monotonic_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_cluster
+define void @global_acq_rel_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_volatile_cluster
+define void @global_acq_rel_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_cluster
+define void @global_seq_cst_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_seq_cst_volatile_cluster
+define void @global_seq_cst_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+;; shared
+
+; CHECK-LABEL: shared_unordered_cluster
+define void @shared_unordered_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_unordered_volatile_cluster
+define void @shared_unordered_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_cluster
+define void @shared_monotonic_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile_cluster
+define void @shared_monotonic_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_cluster
+define void @shared_acq_rel_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_volatile_cluster
+define void @shared_acq_rel_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_cluster
+define void @shared_seq_cst_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.cluster
+ ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.cluster
+ ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_seq_cst_volatile_cluster
+define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+;; local statespace
+
+; CHECK-LABEL: local_unordered_cluster
+define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_unordered_volatile_cluster
+define void @local_unordered_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_cluster
+define void @local_monotonic_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile_cluster
+define void @local_monotonic_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_cluster
+define void @local_acq_rel_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_volatile_cluster
+define void @local_acq_rel_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_cluster
+define void @local_seq_cst_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_seq_cst_volatile_cluster
+define void @local_seq_cst_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index aac73f71a6766c..f922fd92fa244e 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -9,10 +9,21 @@
; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
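To illustrate the 128-bit limit: a four-element load of 32-bit data still fits in one vector access, while .v4.u64 (256 bits) does not. A minimal sketch, not part of this test and with a hypothetical function name:

define <4 x float> @vec_load_128bit(ptr %p) {
  ; 4 x f32 = 128 bits, so this may lower to a single ld.v4.f32
  %v = load <4 x float>, ptr %p, align 16
  ret <4 x float> %v
}

A load of <4 x i64>, by contrast, would be 256 bits wide and has to be split into narrower accesses.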
+; TODO: generate PTX that preserves Concurrent Forward Progress
+; for atomic operations to the local statespace
+; by generating atomic or volatile operations.
+
+; TODO: design exposure for atomic operations on vector types.
+
+; TODO: add weak, atomic, volatile, and atomic volatile tests
+; for the .const and .param statespaces (see the sketch below).
+
+; TODO: optimize .sys.shared into .cta.shared or .cluster.shared.
+
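As a sketch of the .const TODO above (not part of this patch; the function name is hypothetical, and it assumes the usual NVPTX mapping of addrspace(4) to the .const statespace and that the backend selects ld.const for such loads):

; CHECK-LABEL: const_weak
define i32 @const_weak(ptr addrspace(4) %a) {
  ; a weak (non-atomic, non-volatile) load from constant memory
  ; CHECK: ld.const.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
  %v = load i32, ptr addrspace(4) %a
  ret i32 %v
}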
; generic statespace
-; CHECK-LABEL: generic_plain
-define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+; CHECK-LABEL: generic_weak
+define void @generic_weak(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr %a
%a.add = add i8 %a.load, 1
@@ -238,198 +249,198 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr
ret void
}
-; CHECK-LABEL: generic_monotonic
-define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_unordered_sys
+define void @generic_unordered_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr %a monotonic, align 1
+ %a.load = load atomic i8, ptr %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr %a monotonic, align 1
+ store atomic i8 %a.add, ptr %a unordered, align 1
; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b monotonic, align 2
+ %b.load = load atomic i16, ptr %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b monotonic, align 2
+ store atomic i16 %b.add, ptr %b unordered, align 2
; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c monotonic, align 4
+ %c.load = load atomic i32, ptr %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c monotonic, align 4
+ store atomic i32 %c.add, ptr %c unordered, align 4
; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d monotonic, align 8
+ %d.load = load atomic i64, ptr %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d monotonic, align 8
+ store atomic i64 %d.add, ptr %d unordered, align 8
; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e monotonic, align 4
+ %e.load = load atomic float, ptr %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e monotonic, align 4
+ store atomic float %e.add, ptr %e unordered, align 4
; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e monotonic, align 8
+ %f.load = load atomic double, ptr %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e monotonic, align 8
+ store atomic double %f.add, ptr %e unordered, align 8
ret void
}
-; CHECK-LABEL: generic_monotonic_volatile
-define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_unordered_volatile_sys
+define void @generic_unordered_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr %a unordered, align 1
; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr %b unordered, align 2
; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr %c unordered, align 4
; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr %d unordered, align 8
; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr %e monotonic, align 4
+ store atomic volatile float %e.add, ptr %e unordered, align 4
; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr %e monotonic, align 8
+ store atomic volatile double %f.add, ptr %e unordered, align 8
ret void
}
-; CHECK-LABEL: generic_unordered
-define void @generic_unordered(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_monotonic_sys
+define void @generic_monotonic_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr %a unordered, align 1
+ %a.load = load atomic i8, ptr %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr %a unordered, align 1
+ store atomic i8 %a.add, ptr %a monotonic, align 1
; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b unordered, align 2
+ %b.load = load atomic i16, ptr %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b unordered, align 2
+ store atomic i16 %b.add, ptr %b monotonic, align 2
; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c unordered, align 4
+ %c.load = load atomic i32, ptr %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c unordered, align 4
+ store atomic i32 %c.add, ptr %c monotonic, align 4
; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d unordered, align 8
+ %d.load = load atomic i64, ptr %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d unordered, align 8
+ store atomic i64 %d.add, ptr %d monotonic, align 8
; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e unordered, align 4
+ store atomic float %e.add, ptr %e monotonic, align 4
; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e unordered, align 8
+ %f.load = load atomic double, ptr %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e unordered, align 8
+ store atomic double %f.add, ptr %e monotonic, align 8
ret void
}
-; CHECK-LABEL: generic_unordered_volatile
-define void @generic_unordered_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_monotonic_volatile_sys
+define void @generic_monotonic_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr %a monotonic, align 1
; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr %b monotonic, align 2
; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr %c monotonic, align 4
; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr %d monotonic, align 8
; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr %e unordered, align 4
+ store atomic volatile float %e.add, ptr %e monotonic, align 4
; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr %e unordered, align 8
+ %f.load = load atomic volatile double, ptr %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr %e unordered, align 8
+ store atomic volatile double %f.add, ptr %e monotonic, align 8
ret void
}
;; global statespace
-; CHECK-LABEL: global_plain
-define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
+; CHECK-LABEL: global_weak
+define void @global_weak(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr addrspace(1) %a
%a.add = add i8 %a.load, 1
@@ -630,222 +641,222 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs
ret void
}
-; CHECK-LABEL: global_monotonic
-define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_unordered_sys
+define void @global_unordered_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
+ %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+ store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
+ %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+ store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
+ %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+ store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
+ %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+ store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
+ %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
+ store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
+ %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
+ store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
ret void
}
-; CHECK-LABEL: global_monotonic_volatile
-define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_unordered_volatile_sys
+define void @global_unordered_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
+ store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
+ store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
ret void
}
-; CHECK-LABEL: global_unordered
-define void @global_unordered(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_monotonic_sys
+define void @global_monotonic_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1
+ %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a unordered, align 1
+ store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b unordered, align 2
+ %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b unordered, align 2
+ store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c unordered, align 4
+ %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c unordered, align 4
+ store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d unordered, align 8
+ %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d unordered, align 8
+ store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e unordered, align 4
+ store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e unordered, align 8
+ %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e unordered, align 8
+ store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
ret void
}
-; CHECK-LABEL: global_unordered_volatile
-define void @global_unordered_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+; CHECK-LABEL: global_monotonic_volatile_sys
+define void @global_monotonic_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e unordered, align 4
+ store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e unordered, align 8
+ %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e unordered, align 8
+ store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
ret void
}
;; shared statespace
-; CHECK-LABEL: shared_plain
-define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
+; CHECK-LABEL: shared_weak
+define void @shared_weak(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr addrspace(3) %a
%a.add = add i8 %a.load, 1
@@ -1046,202 +1057,198 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs
ret void
}
-; CHECK-LABEL: shared_monotonic
-define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared.
-
+; CHECK-LABEL: shared_unordered_sys
+define void @shared_unordered_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
+ %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
+ %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+ store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
+ %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+ store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
+ %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+ store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
+ %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
+ store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
+ %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
+ store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
ret void
}
-; CHECK-LABEL: shared_monotonic_volatile
-define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+; CHECK-LABEL: shared_unordered_volatile_sys
+define void @shared_unordered_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
+ store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
+ store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
ret void
}
-; CHECK-LABEL: shared_unordered
-define void @shared_unordered(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; TODO: optimize .sys.shared to .cta.shared or .cluster.shared.
-
+; CHECK-LABEL: shared_monotonic_sys
+define void @shared_monotonic_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1
+ %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a unordered, align 1
+ store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b unordered, align 2
+ %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b unordered, align 2
+ store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c unordered, align 4
+ %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c unordered, align 4
+ store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d unordered, align 8
+ %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d unordered, align 8
+ store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e unordered, align 4
+ store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e unordered, align 8
+ %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e unordered, align 8
+ store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
ret void
}
-; CHECK-LABEL: shared_unordered_volatile
-define void @shared_unordered_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+; CHECK-LABEL: shared_monotonic_volatile_sys
+define void @shared_monotonic_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e unordered, align 4
+ store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e unordered, align 8
+ %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e unordered, align 8
+ store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
ret void
}
;; local statespace
-; CHECK-LABEL: local_plain
-define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+; CHECK-LABEL: local_weak
+define void @local_weak(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr addrspace(5) %a
%a.add = add i8 %a.load, 1
@@ -1343,9 +1350,6 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace
; CHECK-LABEL: local_volatile
define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using volatile operations.
-
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load volatile i8, ptr addrspace(5) %a
%a.add = add i8 %a.load, 1
@@ -1445,175 +1449,166 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp
ret void
}
-; CHECK-LABEL: local_monotonic
-define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by using PTX atomic operations.
-
+; CHECK-LABEL: local_unordered_sys
+define void @local_unordered_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
+ %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
+ %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+ store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
+ %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+ store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
+ %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+ store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+ %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
+ store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
+ %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
+ store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
ret void
}
-; CHECK-LABEL: local_monotonic_volatile
-define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
- ; TODO: generate PTX that preserves Concurrent Forward Progress
- ; by generating atomic or volatile operations
-
+; CHECK-LABEL: local_unordered_volatile_sys
+define void @local_unordered_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
+ store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
+ store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
ret void
}
-; CHECK-LABEL: local_unordered
-define void @local_unordered(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_monotonic_sys
+define void @local_monotonic_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1
+ %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(5) %a unordered, align 1
+ store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b unordered, align 2
+ %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b unordered, align 2
+ store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c unordered, align 4
+ %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c unordered, align 4
+ store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d unordered, align 8
+ %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d unordered, align 8
+ store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e unordered, align 4
+ store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e unordered, align 8
+ %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e unordered, align 8
+ store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
ret void
}
-; CHECK-LABEL: local_unordered_volatile
-define void @local_unordered_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_monotonic_volatile
+define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(5) %a unordered, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b unordered, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b unordered, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c unordered, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c unordered, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d unordered, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d unordered, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e unordered, align 4
- %e.add = fadd float %e.load, 1.0
+ %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e unordered, align 4
+ store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e unordered, align 8
+ %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e unordered, align 8
+ store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
ret void
}
-
-; TODO: add plain,atomic,volatile,atomic volatile tests
-; for .const and .param statespaces
\ No newline at end of file