[llvm] [NVPTX] Add Volta Atomic SequentiallyConsistent Load and Store Operations (PR #98551)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 11 14:10:22 PDT 2024
https://github.com/gonzalobg created https://github.com/llvm/llvm-project/pull/98551
This PR builds on #98022.
In this PR, the `fence.sc.sys` is inserted as part of the load/store instruction lowering itself.
A follow-up PR will refactor a few things, add fence instruction support, and split the fence emission out.
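For illustration, a minimal sketch of the intended sm_70+ lowering (not taken verbatim from the patch; register names are hypothetical): a sequentially consistent load becomes a `fence.sc.sys` followed by an acquire load, and a sequentially consistent store becomes a `fence.sc.sys` followed by a release store.

  ; LLVM IR
  %v = load atomic i32, ptr %p seq_cst, align 4
  store atomic i32 %v, ptr %p seq_cst, align 4

  ; Expected PTX: the fence is emitted as part of each operation
  ;   fence.sc.sys
  ;   ld.acquire.sys.u32 %r1, [%rd1];
  ;   fence.sc.sys
  ;   st.release.sys.u32 [%rd1], %r1;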
From dd737092ead2417e63aaff5614ddc0680e680d67 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Sun, 23 Jun 2024 19:29:25 +0200
Subject: [PATCH 1/7] [NVPTX] Adds float/double tests to load-store tests.
---
llvm/test/CodeGen/NVPTX/load-store.ll | 30 +++++++++++++++++++++++++++
1 file changed, 30 insertions(+)
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index 0955b433e0f76..c477bd9e744cd 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -27,6 +27,18 @@ define void @plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
store i64 %d.add, ptr %d
+ ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load float, ptr %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store float %e.add, ptr %c
+
+ ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load double, ptr %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store double %f.add, ptr %c
+
ret void
}
@@ -56,6 +68,18 @@ define void @volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
store volatile i64 %d.add, ptr %d
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load volatile float, ptr %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store volatile float %e.add, ptr %c
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load volatile double, ptr %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store volatile double %f.add, ptr %c
+
ret void
}
@@ -91,5 +115,11 @@ define void @monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_add
; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr %e monotonic, align 4
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e monotonic, align 8
+
ret void
}
From ec2ce73be799cead0b389eed5fdbba020601c5d0 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Tue, 9 Jul 2024 18:17:44 +0200
Subject: [PATCH 2/7] [NVPTX] Volta Relaxed/Acquire/Release and Volatile
Load/Store Ops
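A condensed view of the mapping this commit implements on sm_70+/PTX 6.0+, distilled from the tests added below (a sketch; register placeholders as in the tests):

  %a.load = load atomic i16, ptr %b acquire, align 2
  ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
  store atomic i16 %a.load, ptr %b release, align 2
  ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}

Monotonic lowers to .relaxed.sys, volatile keeps lowering to .volatile, and volatile monotonic on the global statespace lowers to .mmio.relaxed.sys when PTX 8.2+ is available (otherwise .volatile).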
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 28 +-
llvm/lib/Target/NVPTX/NVPTX.h | 8 +
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 301 ++++--
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 144 +--
llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 951 ++++++++++++++++++
llvm/test/CodeGen/NVPTX/load-store.ll | 553 +++++++++-
6 files changed, 1829 insertions(+), 156 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index b7a20c351f5ff..be0ca3b6e66ed 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -224,9 +224,33 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
if (Modifier) {
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int) MO.getImm();
- if (!strcmp(Modifier, "volatile")) {
- if (Imm)
+ if (!strcmp(Modifier, "sem")) {
+ switch (Imm) {
+ case NVPTX::PTXLdStInstCode::NotAtomic:
+ break;
+ case NVPTX::PTXLdStInstCode::Volatile:
O << ".volatile";
+ break;
+ case NVPTX::PTXLdStInstCode::Relaxed:
+ O << ".relaxed.sys";
+ break;
+ case NVPTX::PTXLdStInstCode::Acquire:
+ O << ".acquire.sys";
+ break;
+ case NVPTX::PTXLdStInstCode::Release:
+ O << ".release.sys";
+ break;
+ case NVPTX::PTXLdStInstCode::RelaxedMMIO:
+ O << ".mmio.relaxed.sys";
+ break;
+ default:
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "NVPTX LdStCode Printer does not support \"" << Imm
+ << "\" sem modifier.";
+ report_fatal_error(OS.str());
+ break;
+ }
} else if (!strcmp(Modifier, "addsp")) {
switch (Imm) {
case NVPTX::PTXLdStInstCode::GLOBAL:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index b0cb24c63c3ce..3c7167b157025 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -107,6 +107,14 @@ enum LoadStore {
};
namespace PTXLdStInstCode {
+enum MemorySemantic {
+ NotAtomic = 0, // PTX calls these: "Weak"
+ Volatile = 1,
+ Relaxed = 2,
+ Acquire = 3,
+ Release = 4,
+ RelaxedMMIO = 5
+};
enum AddressSpace {
GENERIC = 0,
GLOBAL = 1,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 1e1cbb15e33d4..2d0c1edb91152 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -700,6 +700,157 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
return NVPTX::PTXLdStInstCode::GENERIC;
}
+static unsigned int getCodeMemorySemantic(MemSDNode *N,
+ const NVPTXSubtarget *Subtarget) {
+ AtomicOrdering Ordering = N->getSuccessOrdering();
+ auto CodeAddrSpace = getCodeAddrSpace(N);
+
+ // Supports relaxed, acquire, release, weak:
+ bool hasAtomics =
+ Subtarget->getPTXVersion() >= 60 && Subtarget->getSmVersion() >= 70;
+ // Supports mmio:
+ bool hasRelaxedMMIO =
+ Subtarget->getPTXVersion() >= 82 && Subtarget->getSmVersion() >= 70;
+
+  // TODO: lowering for SequentiallyConsistent operations; for now, we error.
+  // TODO: lowering for AcquireRelease operations; for now, we error.
+  //
+
+ // clang-format off
+
+ // Lowering for non-SequentiallyConsistent Operations
+ //
+ // | Atomic | Volatile | Statespace | Lowering sm_60- | Lowering sm_70+ |
+ // |---------|----------|-------------------------------|-----------------|------------------------------------------------------|
+ // | No | No | All | plain | .weak |
+ // | No | Yes | Generic / Shared / Global [0] | .volatile | .volatile |
+ // | No | Yes | Local / Const / Param | plain [1] | .weak [1] |
+ // | Relaxed | No | Generic / Shared / Global [0] | .volatile | <atomic sem> |
+ // | Other | No | Generic / Shared / Global [0] | Error [2] | <atomic sem> |
+ // | Yes | No | Local / Const / Param | plain [1] | .weak [1] |
+ // | Relaxed | Yes | Generic / Shared [0] | .volatile | .volatile |
+ // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) or .volatile (PTX 8.1-) |
+ // | Relaxed | Yes | Local / Const / Param | plain [1] | .weak [1] |
+ // | Other | Yes | Generic / Shared / Global [0] | Error [4] | <atomic sem> [3] |
+
+ // clang-format on
+
+  // [0]: volatile and atomics are only supported on the generic, shared,
+  //      and global statespaces; a generic address must refer to shared
+  //      or global memory. MMIO requires a global address (or a generic
+  //      address to global memory), but (TODO) we only implement it for global.
+
+  // [1]: TODO: this implementation exhibits PTX Undefined Behavior; it
+  //      fails to preserve the side-effects of atomic and volatile
+  //      accesses to local / const / param in LLVM IR, causing
+  //      well-formed LLVM IR & CUDA C++ programs to be miscompiled
+  //      on sm_70+.
+
+ if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
+ CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
+ CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
+ return NVPTX::PTXLdStInstCode::NotAtomic;
+ }
+
+ // [2]: Atomics with Ordering different than Relaxed are not supported on
+ // sm_60 and older.
+ if (!(Ordering == AtomicOrdering::NotAtomic ||
+ Ordering == AtomicOrdering::Monotonic) &&
+ !hasAtomics) {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "PTX does not support \"atomic\" for orderings different than"
+ "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \""
+ << toIRString(Ordering) << "\".";
+ report_fatal_error(OS.str());
+ }
+
+  // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now
+  //      we drop the volatile semantics and preserve the atomic ones.
+  // [4]: TODO: volatile atomics with order stronger than relaxed are
+  //      currently unimplemented in sm_60 and older.
+ if (!hasAtomics && N->isVolatile() &&
+ !(Ordering == AtomicOrdering::NotAtomic ||
+ Ordering == AtomicOrdering::Monotonic)) {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "PTX does not support \"volatile atomic\" for orderings different "
+ "than \"NotAtomic\" or \"Monotonic\" for sm_60 and older, but order "
+ "is: \""
+ << toIRString(Ordering) << "\".";
+ report_fatal_error(OS.str());
+ }
+
+  // PTX volatile and PTX atomics are not available for statespaces that differ
+ // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
+ // atomics is undefined if the generic address does not refer to a .global or
+ // .shared memory location.
+ bool addrGenericOrGlobalOrShared =
+ (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
+ CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
+ CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
+ bool useRelaxedMMIO =
+ hasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
+
+ switch (Ordering) {
+ case AtomicOrdering::NotAtomic:
+ return N->isVolatile() && addrGenericOrGlobalOrShared
+ ? NVPTX::PTXLdStInstCode::Volatile
+ : NVPTX::PTXLdStInstCode::NotAtomic;
+ case AtomicOrdering::Monotonic:
+ if (N->isVolatile())
+ return useRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO
+ : addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
+ : NVPTX::PTXLdStInstCode::NotAtomic;
+ else
+ return hasAtomics ? NVPTX::PTXLdStInstCode::Relaxed
+ : addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
+ : NVPTX::PTXLdStInstCode::NotAtomic;
+ case AtomicOrdering::Acquire:
+ if (!N->readMem()) {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "PTX only supports Acquire Ordering on reads: "
+ << N->getOperationName();
+ N->print(OS);
+ report_fatal_error(OS.str());
+ }
+ return addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire
+ : NVPTX::PTXLdStInstCode::NotAtomic;
+ case AtomicOrdering::Release:
+ if (!N->writeMem()) {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "PTX only supports Release Ordering on writes: "
+ << N->getOperationName();
+ N->print(OS);
+ report_fatal_error(OS.str());
+ }
+ return addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release
+ : NVPTX::PTXLdStInstCode::NotAtomic;
+ case AtomicOrdering::AcquireRelease: {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "PTX only supports AcquireRelease Ordering on read-modify-write: "
+ << N->getOperationName();
+ N->print(OS);
+ report_fatal_error(OS.str());
+ }
+ case AtomicOrdering::SequentiallyConsistent:
+ case AtomicOrdering::Unordered:
+ default: {
+ // TODO: support AcquireRelease and SequentiallyConsistent
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "NVPTX backend does not support AtomicOrdering \""
+ << toIRString(Ordering) << "\" yet.";
+ report_fatal_error(OS.str());
+ }
+ }
+
+ report_fatal_error("unreachable");
+}
+
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
unsigned CodeAddrSpace, MachineFunction *F) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
@@ -902,32 +1053,18 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
if (!LoadedVT.isSimple())
return false;
- AtomicOrdering Ordering = LD->getSuccessOrdering();
- // In order to lower atomic loads with stronger guarantees we would need to
- // use load.acquire or insert fences. However these features were only added
- // with PTX ISA 6.0 / sm_70.
- // TODO: Check if we can actually use the new instructions and implement them.
- if (isStrongerThanMonotonic(Ordering))
- return false;
-
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
return tryLDGLDU(N);
}
+ // Memory Semantic Setting
+ unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget);
+
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
- // Volatile Setting
- // - .volatile is only available for .global and .shared
- // - .volatile has the same memory synchronization semantics as .relaxed.sys
- bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
- if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
- isVolatile = false;
-
// Type Setting: fromType + fromTypeWidth
//
// Sign : ISD::SEXTLOAD
@@ -968,9 +1105,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl), getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl), Addr, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl),
+ Addr,
+ Chain};
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
: SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
@@ -979,9 +1120,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl), getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl),
+ Base,
+ Offset,
+ Chain};
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
: SelectADDRri(N1.getNode(), N1, Base, Offset)) {
@@ -996,9 +1142,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl), getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl),
+ Base,
+ Offset,
+ Chain};
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else {
if (PointerSize == 64)
@@ -1012,9 +1163,13 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl), getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl), N1, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl),
+ getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl),
+ N1,
+ Chain};
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
}
@@ -1051,13 +1206,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
- // Volatile Setting
- // - .volatile is only availalble for .global and .shared
- bool IsVolatile = MemSD->isVolatile();
- if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
- IsVolatile = false;
+ // Memory Semantic Setting
+ unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1124,9 +1274,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL), Addr, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL),
+ getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL),
+ Addr,
+ Chain};
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else if (PointerSize == 64
? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
@@ -1149,9 +1303,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL),
+ getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL),
+ Base,
+ Offset,
+ Chain};
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else if (PointerSize == 64
? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
@@ -1194,9 +1353,14 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL),
+ getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL),
+ Base,
+ Offset,
+ Chain};
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else {
@@ -1239,9 +1403,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL), getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL), Op1, Chain };
+ SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL),
+ getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL),
+ Op1,
+ Chain};
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
}
@@ -1684,27 +1852,13 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!StoreVT.isSimple())
return false;
- AtomicOrdering Ordering = ST->getSuccessOrdering();
- // In order to lower atomic loads with stronger guarantees we would need to
- // use store.release or insert fences. However these features were only added
- // with PTX ISA 6.0 / sm_70.
- // TODO: Check if we can actually use the new instructions and implement them.
- if (isStrongerThanMonotonic(Ordering))
- return false;
-
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
- // Volatile Setting
- // - .volatile is only available for .global and .shared
- // - .volatile has the same memory synchronization semantics as .relaxed.sys
- bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
- if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
- isVolatile = false;
+ // Memory Semantic Setting
+ unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget);
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1741,7 +1895,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(isVolatile, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -1758,7 +1912,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(isVolatile, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -1783,7 +1937,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
SDValue Ops[] = {Value,
- getI32Imm(isVolatile, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -1805,7 +1959,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(isVolatile, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -1844,13 +1998,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
- // Volatile Setting
- // - .volatile is only availalble for .global and .shared
- bool IsVolatile = MemSD->isVolatile();
- if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
- CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
- IsVolatile = false;
+ // Memory Semantic Setting
+ unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
@@ -1892,7 +2041,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
ToTypeWidth = 32;
}
- StOps.push_back(getI32Imm(IsVolatile, DL));
+ StOps.push_back(getI32Imm(CodeMemorySem, DL));
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
StOps.push_back(getI32Imm(VecType, DL));
StOps.push_back(getI32Imm(ToType, DL));
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index c4c35a1f74ba9..9be8fe5d4e6ef 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2939,39 +2939,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
multiclass LD<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _ari : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _ari_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _asi : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
}
@@ -2987,39 +2987,39 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _ari : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _asi : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
}
@@ -3038,75 +3038,75 @@ let mayStore=1, hasSideEffects=0 in {
multiclass LD_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v4_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
@@ -3121,84 +3121,84 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_ari : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_asi : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$isVol, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v4_avar : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_asi : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$isVol, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "st${isVol:volatile}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}"
"$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
new file mode 100644
index 0000000000000..2ae71bf1230e2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -0,0 +1,951 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify %}
+
+; CHECK-LABEL: generic_plain
+define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+ ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load i8, ptr %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i8 %a.add, ptr %a
+
+ ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load i16, ptr %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i16 %b.add, ptr %b
+
+ ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load i32, ptr %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store i32 %c.add, ptr %c
+
+ ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load i64, ptr %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store i64 %d.add, ptr %d
+
+ ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load float, ptr %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store float %e.add, ptr %c
+
+ ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load double, ptr %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store double %f.add, ptr %c
+
+ ret void
+}
+
+; CHECK-LABEL: generic_volatile
+define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load volatile i8, ptr %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i8 %a.add, ptr %a
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load volatile i16, ptr %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i16 %b.add, ptr %b
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load volatile i32, ptr %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile i32 %c.add, ptr %c
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load volatile i64, ptr %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store volatile i64 %d.add, ptr %d
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load volatile float, ptr %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store volatile float %e.add, ptr %c
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load volatile double, ptr %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store volatile double %f.add, ptr %c
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic
+define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a monotonic, align 1
+
+ ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b monotonic, align 2
+
+ ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c monotonic, align 4
+
+ ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d monotonic, align 8
+
+ ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e monotonic, align 4
+
+ ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_acq_rel
+define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a release, align 1
+
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b release, align 2
+
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c release, align 4
+
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d release, align 8
+
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e acquire, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e release, align 4
+
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: generic_monotonic_volatile
+define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d monotonic, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e monotonic, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e monotonic, align 8
+
+ ret void
+}
+
+;; global statespace
+
+; CHECK-LABEL: global_plain
+define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
+ ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load i8, ptr addrspace(1) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i8 %a.add, ptr addrspace(1) %a
+
+ ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load i16, ptr addrspace(1) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i16 %b.add, ptr addrspace(1) %b
+
+ ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load i32, ptr addrspace(1) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store i32 %c.add, ptr addrspace(1) %c
+
+ ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load i64, ptr addrspace(1) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store i64 %d.add, ptr addrspace(1) %d
+
+ ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load float, ptr addrspace(1) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store float %e.add, ptr addrspace(1) %c
+
+ ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load double, ptr addrspace(1) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store double %f.add, ptr addrspace(1) %c
+
+ ret void
+}
+
+; CHECK-LABEL: global_volatile
+define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
+ ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load volatile i8, ptr addrspace(1) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i8 %a.add, ptr addrspace(1) %a
+
+ ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load volatile i16, ptr addrspace(1) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i16 %b.add, ptr addrspace(1) %b
+
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load volatile i32, ptr addrspace(1) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile i32 %c.add, ptr addrspace(1) %c
+
+ ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load volatile i64, ptr addrspace(1) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store volatile i64 %d.add, ptr addrspace(1) %d
+
+ ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load volatile float, ptr addrspace(1) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store volatile float %e.add, ptr addrspace(1) %c
+
+ ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load volatile double, ptr addrspace(1) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store volatile double %f.add, ptr addrspace(1) %c
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic
+define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+
+ ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+
+ ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+
+ ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+
+ ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
+
+ ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic_volatile
+define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
+
+ ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel
+define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_acq_rel_volatile
+define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8
+
+ ret void
+}
+
+;; shared statespace
+
+; CHECK-LABEL: shared_plain
+define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
+ ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load i8, ptr addrspace(3) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i8 %a.add, ptr addrspace(3) %a
+
+ ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load i16, ptr addrspace(3) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i16 %b.add, ptr addrspace(3) %b
+
+ ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load i32, ptr addrspace(3) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store i32 %c.add, ptr addrspace(3) %c
+
+ ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load i64, ptr addrspace(3) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store i64 %d.add, ptr addrspace(3) %d
+
+ ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load float, ptr addrspace(3) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store float %e.add, ptr addrspace(3) %c
+
+ ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load double, ptr addrspace(3) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store double %f.add, ptr addrspace(3) %c
+
+ ret void
+}
+
+; CHECK-LABEL: shared_volatile
+define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load volatile i8, ptr addrspace(3) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i8 %a.add, ptr addrspace(3) %a
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load volatile i16, ptr addrspace(3) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i16 %b.add, ptr addrspace(3) %b
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load volatile i32, ptr addrspace(3) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile i32 %c.add, ptr addrspace(3) %c
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load volatile i64, ptr addrspace(3) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store volatile i64 %d.add, ptr addrspace(3) %d
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load volatile float, ptr addrspace(3) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store volatile float %e.add, ptr addrspace(3) %c
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load volatile double, ptr addrspace(3) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store volatile double %f.add, ptr addrspace(3) %c
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic
+define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+
+ ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+
+ ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+
+ ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+
+ ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
+
+ ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile
+define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel
+define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_acq_rel_volatile
+define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1
+
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2
+
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8
+
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4
+
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8
+
+ ret void
+}
+
+;; local statespace
+
+; CHECK-LABEL: local_plain
+define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load i8, ptr addrspace(5) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i8 %a.add, ptr addrspace(5) %a
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load i16, ptr addrspace(5) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i16 %b.add, ptr addrspace(5) %b
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load i32, ptr addrspace(5) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store i32 %c.add, ptr addrspace(5) %c
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load i64, ptr addrspace(5) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store i64 %d.add, ptr addrspace(5) %d
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load float, ptr addrspace(5) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store float %e.add, ptr addrspace(5) %c
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load double, ptr addrspace(5) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store double %f.add, ptr addrspace(5) %c
+
+ ret void
+}
+
+; CHECK-LABEL: local_volatile
+define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load volatile i8, ptr addrspace(5) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i8 %a.add, ptr addrspace(5) %a
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load volatile i16, ptr addrspace(5) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i16 %b.add, ptr addrspace(5) %b
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load volatile i32, ptr addrspace(5) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile i32 %c.add, ptr addrspace(5) %c
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load volatile i64, ptr addrspace(5) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store volatile i64 %d.add, ptr addrspace(5) %d
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load volatile float, ptr addrspace(5) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store volatile float %e.add, ptr addrspace(5) %c
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load volatile double, ptr addrspace(5) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store volatile double %f.add, ptr addrspace(5) %c
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic
+define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile
+define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel
+define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e release, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_acq_rel_volatile
+define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8
+
+ ret void
+}
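For context, the acquire/release IR exercised above is what a device-side acquire-load / release-store pair typically lowers from. The following is a minimal C++ sketch, not part of the patch; the function names and the use of std::atomic are illustrative assumptions only.

#include <atomic>

// Spin until the producer publishes, then read the payload.
int consume(std::atomic<int> &flag, const int &data) {
  // "load atomic i32 ... acquire" -> ld.acquire.sys.* on sm_70+ / PTX 6.0+
  while (flag.load(std::memory_order_acquire) == 0) {
  }
  return data;
}

// Write the payload, then publish it with a release store.
void produce(std::atomic<int> &flag, int &data, int v) {
  data = v;
  // "store atomic i32 ... release" -> st.release.sys.* on sm_70+ / PTX 6.0+
  flag.store(1, std::memory_order_release);
}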
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index c477bd9e744cd..27065f5eca9f4 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -1,8 +1,10 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; CHECK-LABEL: plain
-define void @plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+; generic statespace
+
+; CHECK-LABEL: generic_plain
+define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load i8, ptr %a
%a.add = add i8 %a.load, 1
@@ -42,8 +44,8 @@ define void @plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
ret void
}
-; CHECK-LABEL: volatile
-define void @volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
+; CHECK-LABEL: generic_volatile
+define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load volatile i8, ptr %a
%a.add = add i8 %a.load, 1
@@ -83,8 +85,8 @@ define void @volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
ret void
}
-; CHECK-LABEL: monotonic
-define void @monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_monotonic
+define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr %a monotonic, align 1
%a.add = add i8 %a.load, 1
@@ -123,3 +125,542 @@ define void @monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_add
ret void
}
+
+; CHECK-LABEL: generic_monotonic_volatile
+define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a monotonic, align 1
+
+ ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b monotonic, align 2
+
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c monotonic, align 4
+
+ ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d monotonic, align 8
+
+ ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e monotonic, align 4
+
+ ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e monotonic, align 8
+
+ ret void
+}
+
+;; global statespace
+
+; CHECK-LABEL: global_plain
+define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
+ ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load i8, ptr addrspace(1) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i8 %a.add, ptr addrspace(1) %a
+
+ ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load i16, ptr addrspace(1) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i16 %b.add, ptr addrspace(1) %b
+
+ ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load i32, ptr addrspace(1) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store i32 %c.add, ptr addrspace(1) %c
+
+ ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load i64, ptr addrspace(1) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store i64 %d.add, ptr addrspace(1) %d
+
+ ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load float, ptr addrspace(1) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store float %e.add, ptr addrspace(1) %c
+
+ ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load double, ptr addrspace(1) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store double %f.add, ptr addrspace(1) %c
+
+ ret void
+}
+
+; CHECK-LABEL: global_volatile
+define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
+ ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load volatile i8, ptr addrspace(1) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i8 %a.add, ptr addrspace(1) %a
+
+ ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load volatile i16, ptr addrspace(1) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i16 %b.add, ptr addrspace(1) %b
+
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load volatile i32, ptr addrspace(1) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile i32 %c.add, ptr addrspace(1) %c
+
+ ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load volatile i64, ptr addrspace(1) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store volatile i64 %d.add, ptr addrspace(1) %d
+
+ ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load volatile float, ptr addrspace(1) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store volatile float %e.add, ptr addrspace(1) %c
+
+ ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load volatile double, ptr addrspace(1) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store volatile double %f.add, ptr addrspace(1) %c
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic
+define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+
+ ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+
+ ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+
+ ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
+
+ ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: global_monotonic_volatile
+define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+
+ ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+
+ ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+
+ ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
+
+ ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
+
+ ret void
+}
+
+;; shared statespace
+
+; CHECK-LABEL: shared_plain
+define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
+ ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load i8, ptr addrspace(3) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i8 %a.add, ptr addrspace(3) %a
+
+ ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load i16, ptr addrspace(3) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i16 %b.add, ptr addrspace(3) %b
+
+ ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load i32, ptr addrspace(3) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store i32 %c.add, ptr addrspace(3) %c
+
+ ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load i64, ptr addrspace(3) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store i64 %d.add, ptr addrspace(3) %d
+
+ ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load float, ptr addrspace(3) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store float %e.add, ptr addrspace(3) %c
+
+ ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load double, ptr addrspace(3) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store double %f.add, ptr addrspace(3) %c
+
+ ret void
+}
+
+; CHECK-LABEL: shared_volatile
+define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load volatile i8, ptr addrspace(3) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i8 %a.add, ptr addrspace(3) %a
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load volatile i16, ptr addrspace(3) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i16 %b.add, ptr addrspace(3) %b
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load volatile i32, ptr addrspace(3) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile i32 %c.add, ptr addrspace(3) %c
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load volatile i64, ptr addrspace(3) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store volatile i64 %d.add, ptr addrspace(3) %d
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load volatile float, ptr addrspace(3) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store volatile float %e.add, ptr addrspace(3) %c
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load volatile double, ptr addrspace(3) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store volatile double %f.add, ptr addrspace(3) %c
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic
+define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: shared_monotonic_volatile
+define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+
+ ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+
+ ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
+
+ ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
+
+ ret void
+}
+
+;; local statespace
+
+; CHECK-LABEL: local_plain
+define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load i8, ptr addrspace(5) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i8 %a.add, ptr addrspace(5) %a
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load i16, ptr addrspace(5) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store i16 %b.add, ptr addrspace(5) %b
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load i32, ptr addrspace(5) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store i32 %c.add, ptr addrspace(5) %c
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load i64, ptr addrspace(5) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store i64 %d.add, ptr addrspace(5) %d
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load float, ptr addrspace(5) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store float %e.add, ptr addrspace(5) %c
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load double, ptr addrspace(5) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store double %f.add, ptr addrspace(5) %c
+
+ ret void
+}
+
+; CHECK-LABEL: local_volatile
+define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load volatile i8, ptr addrspace(5) %a
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i8 %a.add, ptr addrspace(5) %a
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load volatile i16, ptr addrspace(5) %b
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store volatile i16 %b.add, ptr addrspace(5) %b
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load volatile i32, ptr addrspace(5) %c
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile i32 %c.add, ptr addrspace(5) %c
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load volatile i64, ptr addrspace(5) %d
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store volatile i64 %d.add, ptr addrspace(5) %d
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load volatile float, ptr addrspace(5) %c
+ %e.add = fadd float %e.load, 1.
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store volatile float %e.add, ptr addrspace(5) %c
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load volatile double, ptr addrspace(5) %c
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store volatile double %f.add, ptr addrspace(5) %c
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic
+define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
+
+ ret void
+}
+
+; CHECK-LABEL: local_monotonic_volatile
+define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+
+ ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+
+ ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+
+ ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
+
+ ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
+
+ ret void
+}
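The sm_20 RUN lines above exercise the fallback path: monotonic (relaxed) atomics degrade to .volatile in the generic/global/shared statespaces and to plain weak accesses in the local statespace. A minimal standalone C++ sketch of that decision, mirroring the Monotonic case of getCodeMemorySemantic (the enum and function names here are illustrative, not the backend's):

#include <cassert>

enum class Qualifier { NotAtomic, Volatile, Relaxed, RelaxedMMIO };

// Pick the PTX qualifier for a monotonic load/store.
Qualifier lowerMonotonic(bool IsVolatile, bool HasMemoryOrdering,
                         bool UseRelaxedMMIO, bool GenericGlobalOrShared) {
  if (IsVolatile)
    return UseRelaxedMMIO          ? Qualifier::RelaxedMMIO
           : GenericGlobalOrShared ? Qualifier::Volatile
                                   : Qualifier::NotAtomic;
  return HasMemoryOrdering       ? Qualifier::Relaxed
         : GenericGlobalOrShared ? Qualifier::Volatile
                                 : Qualifier::NotAtomic;
}

int main() {
  // sm_20 (no ordering support): monotonic becomes ld./st.volatile.
  assert(lowerMonotonic(false, false, false, true) == Qualifier::Volatile);
  // Local statespace: monotonic becomes a plain weak access.
  assert(lowerMonotonic(false, false, false, false) == Qualifier::NotAtomic);
  // Volatile monotonic on global with PTX 8.2+: .mmio.relaxed.sys.
  assert(lowerMonotonic(true, true, true, true) == Qualifier::RelaxedMMIO);
}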
>From c29b1aa2cf987d1e414ac6d25a472fec395b6133 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Thu, 11 Jul 2024 19:10:57 +0200
Subject: [PATCH 3/7] [NVPTX] Bugfix: fix ODR issues in NVPTXSubtarget
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 16 +++----
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 53 +++++++++++++--------
2 files changed, 39 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 2d0c1edb91152..b661e68afcecc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -705,12 +705,8 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
AtomicOrdering Ordering = N->getSuccessOrdering();
auto CodeAddrSpace = getCodeAddrSpace(N);
- // Supports relaxed, acquire, release, weak:
- bool hasAtomics =
- Subtarget->getPTXVersion() >= 60 && Subtarget->getSmVersion() >= 70;
- // Supports mmio:
- bool hasRelaxedMMIO =
- Subtarget->getPTXVersion() >= 82 && Subtarget->getSmVersion() >= 70;
+ bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
+ bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
// TODO: lowering for SequentiallyConsistent Operations: for now, we error.
// TODO: lowering for AcquireRelease Operations: for now, we error.
@@ -756,7 +752,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
// sm_60 and older.
if (!(Ordering == AtomicOrdering::NotAtomic ||
Ordering == AtomicOrdering::Monotonic) &&
- !hasAtomics) {
+ !HasMemoryOrdering) {
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
OS << "PTX does not support \"atomic\" for orderings different than"
@@ -769,7 +765,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
// the volatile semantics and preserve the atomic ones. [4]: TODO: volatile
// atomics with order stronger than relaxed are currently unimplemented in
// sm_60 and older.
- if (!hasAtomics && N->isVolatile() &&
+ if (!HasMemoryOrdering && N->isVolatile() &&
!(Ordering == AtomicOrdering::NotAtomic ||
Ordering == AtomicOrdering::Monotonic)) {
SmallString<256> Msg;
@@ -790,7 +786,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
bool useRelaxedMMIO =
- hasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
+ HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
switch (Ordering) {
case AtomicOrdering::NotAtomic:
@@ -803,7 +799,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
: addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
: NVPTX::PTXLdStInstCode::NotAtomic;
else
- return hasAtomics ? NVPTX::PTXLdStInstCode::Relaxed
+ return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed
: addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
: NVPTX::PTXLdStInstCode::NotAtomic;
case AtomicOrdering::Acquire:
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 3ca4c1a24c79a..6f7ac23069a5a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -59,42 +59,55 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
NVPTXSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const NVPTXTargetMachine &TM);
- const TargetFrameLowering *getFrameLowering() const override {
+ inline const TargetFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
- const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
- const NVPTXRegisterInfo *getRegisterInfo() const override {
+ inline const NVPTXInstrInfo *getInstrInfo() const override {
+ return &InstrInfo;
+ }
+ inline const NVPTXRegisterInfo *getRegisterInfo() const override {
return &InstrInfo.getRegisterInfo();
}
- const NVPTXTargetLowering *getTargetLowering() const override {
+ inline const NVPTXTargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
+ inline const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
}
- bool hasAtomAddF64() const { return SmVersion >= 60; }
- bool hasAtomScope() const { return SmVersion >= 60; }
- bool hasAtomBitwise64() const { return SmVersion >= 32; }
- bool hasAtomMinMax64() const { return SmVersion >= 32; }
- bool hasLDG() const { return SmVersion >= 32; }
+ inline bool hasAtomAddF64() const { return SmVersion >= 60; }
+ inline bool hasAtomScope() const { return SmVersion >= 60; }
+ inline bool hasAtomBitwise64() const { return SmVersion >= 32; }
+ inline bool hasAtomMinMax64() const { return SmVersion >= 32; }
+ inline bool hasLDG() const { return SmVersion >= 32; }
inline bool hasHWROT32() const { return SmVersion >= 32; }
bool hasImageHandles() const;
- bool hasFP16Math() const { return SmVersion >= 53; }
- bool hasBF16Math() const { return SmVersion >= 80; }
+ inline bool hasFP16Math() const { return SmVersion >= 53; }
+ inline bool hasBF16Math() const { return SmVersion >= 80; }
bool allowFP16Math() const;
- bool hasMaskOperator() const { return PTXVersion >= 71; }
- bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
- unsigned int getFullSmVersion() const { return FullSmVersion; }
- unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
+ inline bool hasMaskOperator() const { return PTXVersion >= 71; }
+ inline bool hasNoReturn() const {
+ return SmVersion >= 30 && PTXVersion >= 64;
+ }
+ // Do the SM and PTX versions support memory orderings (weak, and atomic:
+ // relaxed, acquire, release, acq_rel, sc)?
+ inline bool hasMemoryOrdering() const {
+ return SmVersion >= 70 && PTXVersion >= 60;
+ }
+ // Do the SM and PTX versions support relaxed atomic MMIO operations?
+ inline bool hasRelaxedMMIO() const {
+ return SmVersion >= 70 && PTXVersion >= 82;
+ }
+ inline unsigned int getFullSmVersion() const { return FullSmVersion; }
+ inline unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
// GPUs with "a" suffix have include architecture-accelerated features that
// are supported on the specified architecture only, hence such targets do not
// follow the onion layer model. hasAAFeatures() allows distinguishing such
// GPU variants from the base GPU architecture.
// - 0 represents base GPU model,
// - non-zero value identifies particular architecture-accelerated variant.
- bool hasAAFeatures() const { return getFullSmVersion() % 10; }
- std::string getTargetName() const { return TargetName; }
+ inline bool hasAAFeatures() const { return getFullSmVersion() % 10; }
+ inline std::string getTargetName() const { return TargetName; }
// Get maximum value of required alignments among the supported data types.
// From the PTX ISA doc, section 8.2.3:
@@ -103,9 +116,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// of 64 bits. Memory operations with a vector data-type are modelled as a
// set of equivalent memory operations with a scalar data-type, executed in
// an unspecified order on the elements in the vector.
- unsigned getMaxRequiredAlignment() const { return 8; }
+ inline unsigned getMaxRequiredAlignment() const { return 8; }
- unsigned getPTXVersion() const { return PTXVersion; }
+ inline unsigned getPTXVersion() const { return PTXVersion; }
NVPTXSubtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
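The two new predicates bundle the version checks that the selection code queries. Below is a minimal sketch restating the thresholds as a standalone struct (the struct name is illustrative; the thresholds are the ones added to NVPTXSubtarget.h above):

#include <cassert>

struct Gates {
  unsigned SmVersion;  // e.g. 70 for sm_70 (Volta)
  unsigned PTXVersion; // e.g. 82 for PTX ISA 8.2

  // Weak and atomic orderings (relaxed, acquire, release, acq_rel, sc)
  // require sm_70+ and PTX 6.0+.
  bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
  // Relaxed MMIO additionally requires PTX 8.2+.
  bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
};

int main() {
  assert(!Gates{60, 82}.hasMemoryOrdering()); // sm_60: orderings unsupported
  assert(Gates{70, 60}.hasMemoryOrdering());  // Volta + PTX 6.0: supported
  assert(!Gates{70, 81}.hasRelaxedMMIO());    // PTX 8.1-: falls back to .volatile
  assert(Gates{70, 82}.hasRelaxedMMIO());
}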
>From d163bc5b0f14cc917afe4e7b65f927eea582e7cb Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Thu, 11 Jul 2024 18:04:03 +0200
Subject: [PATCH 4/7] [NVPTX]: Remove redundant check and fix capitalization
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 35 +++++++--------------
1 file changed, 11 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index b661e68afcecc..68f215943a369 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -727,7 +727,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
// | Relaxed | Yes | Generic / Shared [0] | .volatile | .volatile |
// | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) or .volatile (PTX 8.1-) |
// | Relaxed | Yes | Local / Const / Param | plain [1] | .weak [1] |
- // | Other | Yes | Generic / Shared / Global [0] | Error [4] | <atomic sem> [3] |
+ // | Other | Yes | Generic / Shared / Global [0] | Error [2] | <atomic sem> [3] |
// clang-format on
@@ -749,7 +749,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
}
// [2]: Atomics with Ordering different than Relaxed are not supported on
- // sm_60 and older.
+ // sm_60 and older; this includes volatile atomics.
if (!(Ordering == AtomicOrdering::NotAtomic ||
Ordering == AtomicOrdering::Monotonic) &&
!HasMemoryOrdering) {
@@ -762,45 +762,32 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
}
// [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
- // the volatile semantics and preserve the atomic ones. [4]: TODO: volatile
- // atomics with order stronger than relaxed are currently unimplemented in
- // sm_60 and older.
- if (!HasMemoryOrdering && N->isVolatile() &&
- !(Ordering == AtomicOrdering::NotAtomic ||
- Ordering == AtomicOrdering::Monotonic)) {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX does not support \"volatile atomic\" for orderings different "
- "than \"NotAtomic\" or \"Monotonic\" for sm_60 and older, but order "
- "is: \""
- << toIRString(Ordering) << "\".";
- report_fatal_error(OS.str());
- }
+ // the volatile semantics and preserve the atomic ones.
// PTX volatile and PTX atomics are not available for statespaces that differ
// from .generic, .global, or .shared. The behavior of PTX volatile and PTX
// atomics is undefined if the generic address does not refer to a .global or
// .shared memory location.
- bool addrGenericOrGlobalOrShared =
+ bool AddrGenericOrGlobalOrShared =
(CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
- bool useRelaxedMMIO =
+ bool UseRelaxedMMIO =
HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
switch (Ordering) {
case AtomicOrdering::NotAtomic:
- return N->isVolatile() && addrGenericOrGlobalOrShared
+ return N->isVolatile() && AddrGenericOrGlobalOrShared
? NVPTX::PTXLdStInstCode::Volatile
: NVPTX::PTXLdStInstCode::NotAtomic;
case AtomicOrdering::Monotonic:
if (N->isVolatile())
- return useRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO
- : addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
+ return UseRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO
+ : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
: NVPTX::PTXLdStInstCode::NotAtomic;
else
return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed
- : addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
+ : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
: NVPTX::PTXLdStInstCode::NotAtomic;
case AtomicOrdering::Acquire:
if (!N->readMem()) {
@@ -811,7 +798,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
N->print(OS);
report_fatal_error(OS.str());
}
- return addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire
+ return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire
: NVPTX::PTXLdStInstCode::NotAtomic;
case AtomicOrdering::Release:
if (!N->writeMem()) {
@@ -822,7 +809,7 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
N->print(OS);
report_fatal_error(OS.str());
}
- return addrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release
+ return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release
: NVPTX::PTXLdStInstCode::NotAtomic;
case AtomicOrdering::AcquireRelease: {
SmallString<256> Msg;
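As a reading aid for the nested conditional operators in this hunk, here is the Monotonic decision as a stand-alone sketch (simplified enum values; the real code returns NVPTX::PTXLdStInstCode constants):

    enum Sem { NotAtomic, Volatile, Relaxed, RelaxedMMIO };

    // Volatile relaxed atomics prefer .mmio.relaxed.sys when available and
    // otherwise .volatile; non-volatile relaxed atomics prefer .relaxed.sys
    // when the target supports memory orderings. Statespaces other than
    // generic/global/shared degrade to a plain (weak) access.
    Sem lowerMonotonic(bool IsVolatile, bool UseRelaxedMMIO,
                       bool AddrGenericOrGlobalOrShared,
                       bool HasMemoryOrdering) {
      if (IsVolatile)
        return UseRelaxedMMIO                ? RelaxedMMIO
               : AddrGenericOrGlobalOrShared ? Volatile
                                             : NotAtomic;
      return HasMemoryOrdering               ? Relaxed
             : AddrGenericOrGlobalOrShared   ? Volatile
                                             : NotAtomic;
    }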
>From af625238e21833ba1ca54cab84e9acc433ca099f Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Thu, 11 Jul 2024 14:45:46 +0200
Subject: [PATCH 5/7] [NVPTX] Volta SeqCst Load/Store Ops
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 10 ++
llvm/lib/Target/NVPTX/NVPTX.h | 3 +-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 86 ++++++++---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 144 +++++++++---------
llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 55 +++++++
5 files changed, 205 insertions(+), 93 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index be0ca3b6e66ed..6f6f086f1fe95 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -251,6 +251,16 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
report_fatal_error(OS.str());
break;
}
+ } else if (!strcmp(Modifier, "sc")) {
+ switch (Imm) {
+    // TODO: refactor fence insertion into ISelDAGToDAG instead of here,
+ // as part of implementing atomicrmw seq_cst.
+ case NVPTX::PTXLdStInstCode::SeqCstFence:
+ O << "fence.sc.sys;\n\t";
+ break;
+ default:
+ break;
+ }
} else if (!strcmp(Modifier, "addsp")) {
switch (Imm) {
case NVPTX::PTXLdStInstCode::GLOBAL:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 3c7167b157025..d6a4471e2bc33 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -113,7 +113,8 @@ enum MemorySemantic {
Relaxed = 2,
Acquire = 3,
Release = 4,
- RelaxedMMIO = 5
+ RelaxedMMIO = 5,
+ SeqCstFence = 6,
};
enum AddressSpace {
GENERIC = 0,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 68f215943a369..64e08730af4d7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -700,21 +700,24 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
return NVPTX::PTXLdStInstCode::GENERIC;
}
-static unsigned int getCodeMemorySemantic(MemSDNode *N,
- const NVPTXSubtarget *Subtarget) {
+struct MemorySemantic {
+ unsigned int sem = -1;
+ unsigned int sc_fence = -1;
+ MemorySemantic(unsigned int s) : sem(s) {}
+ MemorySemantic(unsigned int s, unsigned int f) : sem(s), sc_fence(f) {}
+};
+
+static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
+ const NVPTXSubtarget *Subtarget) {
AtomicOrdering Ordering = N->getSuccessOrdering();
auto CodeAddrSpace = getCodeAddrSpace(N);
bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
- // TODO: lowering for SequentiallyConsistent Operations: for now, we error.
- // TODO: lowering for AcquireRelease Operations: for now, we error.
- //
-
// clang-format off
- // Lowering for non-SequentiallyConsistent Operations
+  // Lowering for Load/Store Operations (note: AcquireRelease loads or stores are rejected with an error).
//
// | Atomic | Volatile | Statespace | Lowering sm_60- | Lowering sm_70+ |
// |---------|----------|-------------------------------|-----------------|------------------------------------------------------|
@@ -729,6 +732,18 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
// | Relaxed | Yes | Local / Const / Param | plain [1] | .weak [1] |
// | Other | Yes | Generic / Shared / Global [0] | Error [2] | <atomic sem> [3] |
+ // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
+ // by following the ABI proven sound in:
+  // Lustig et al., "A Formal Analysis of the NVIDIA PTX Memory Consistency Model", ASPLOS ’19.
+ // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
+ //
+ // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
+ // |-----------------------------------------------------------------------------|-----------------------------------------|
+ // | cuda::atomic_thread_fence(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; |
+ // | cuda::atomic_load(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; ld.acquire.<scope>; |
+ // | cuda::atomic_store(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; st.release.<scope>; |
+ // | cuda::atomic_fetch_<op>(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; atom.acq_rel.<scope>; |
+
// clang-format on
// [0]: volatile and atomics are only supported on generic addressing to
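To make the table above concrete, here is the same mapping as a hedged C++ sketch (illustrative only: the scope is fixed to .sys as in this PR, operands are elided, and the RMW row is the follow-up work this PR does not yet implement):

    #include <string>

    enum class OpKind { Fence, Load, Store, RMW };

    // PTX emitted for a seq_cst operation at system scope.
    std::string lowerSeqCstSys(OpKind K) {
      const std::string Fence = "fence.sc.sys; ";
      switch (K) {
      case OpKind::Fence: return "fence.sc.sys;";
      case OpKind::Load:  return Fence + "ld.acquire.sys ...;";
      case OpKind::Store: return Fence + "st.release.sys ...;";
      case OpKind::RMW:   return Fence + "atom.acq_rel.sys ...;"; // future work
      }
      return {};
    }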
@@ -819,7 +834,25 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
N->print(OS);
report_fatal_error(OS.str());
}
- case AtomicOrdering::SequentiallyConsistent:
+ case AtomicOrdering::SequentiallyConsistent: {
+ unsigned int sem;
+ if (N->readMem()) {
+ sem = NVPTX::PTXLdStInstCode::Acquire;
+ } else if (N->writeMem()) {
+ sem = NVPTX::PTXLdStInstCode::Release;
+ } else {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "NVPTX does not support SequentiallyConsistent Ordering on "
+ "read-modify-writes yet: "
+ << N->getOperationName();
+ N->print(OS);
+ report_fatal_error(OS.str());
+ }
+ return addrGenericOrGlobalOrShared
+ ? MemorySemantic(sem, NVPTX::PTXLdStInstCode::SeqCstFence)
+ : MemorySemantic(NVPTX::PTXLdStInstCode::NotAtomic);
+ }
case AtomicOrdering::Unordered:
default: {
// TODO: support AcquireRelease and SequentiallyConsistent
@@ -1043,7 +1076,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
}
// Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget);
+ auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(LD, Subtarget);
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
@@ -1088,7 +1121,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1103,7 +1137,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1125,7 +1160,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1146,7 +1182,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1190,7 +1227,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
// Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
+ auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1257,7 +1294,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
+ getI32Imm(CodeMemorySem, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1286,7 +1324,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
+ getI32Imm(CodeMemorySem, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1336,7 +1375,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
+ getI32Imm(CodeMemorySem, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1386,7 +1426,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
+ getI32Imm(CodeMemorySem, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1841,7 +1882,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
// Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget);
+ auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(ST, Subtarget);
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1878,6 +1919,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
+ getI32Imm(SeqCstFence, dl),
getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -1895,6 +1937,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
+ getI32Imm(SeqCstFence, dl),
getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -1920,6 +1963,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
SDValue Ops[] = {Value,
+ getI32Imm(SeqCstFence, dl),
getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -1942,6 +1986,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
+ getI32Imm(SeqCstFence, dl),
getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -1982,7 +2027,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
// Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
+ auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
@@ -2024,6 +2069,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
ToTypeWidth = 32;
}
+ StOps.push_back(getI32Imm(SeqCstFence, DL));
StOps.push_back(getI32Imm(CodeMemorySem, DL));
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
StOps.push_back(getI32Imm(VecType, DL));
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 9be8fe5d4e6ef..1dc2b9eba98cf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2939,39 +2939,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
multiclass LD<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _ari : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _ari_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _asi : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
}
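The new ${sc:sc} operand modifier is what splices the fence into the printed instruction: for a SequentiallyConsistent fence operand it expands to a "fence.sc.sys;" line, and to nothing otherwise, so a seq_cst load prints as two PTX instructions. A minimal sketch of that expansion (illustrative; expandScModifier is a made-up name, the real logic lives in NVPTXInstPrinter::printLdStCode):

    #include <string>

    std::string expandScModifier(bool EmitSeqCstFence) {
      // "\n\t" keeps the following ld/st on its own line in the emitted PTX.
      return EmitSeqCstFence ? "fence.sc.sys;\n\t" : "";
    }

    // expandScModifier(true) + "ld.acquire.sys.u32 %r0, [%rd0];" renders as:
    //   fence.sc.sys;
    //   ld.acquire.sys.u32 %r0, [%rd0];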
@@ -2987,39 +2987,39 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _ari : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _asi : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
}
@@ -3038,75 +3038,75 @@ let mayStore=1, hasSideEffects=0 in {
multiclass LD_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v4_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
@@ -3121,84 +3121,84 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_ari : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_asi : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v4_avar : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_asi : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}"
"$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
index 2ae71bf1230e2..2265ec9e6b15d 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -165,6 +165,59 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam
ret void
}
+; CHECK-LABEL: generic_sc
+define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e seq_cst, align 8
+
+ ret void
+}
+
; CHECK-LABEL: generic_monotonic_volatile
define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
@@ -949,3 +1002,5 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
ret void
}
+
+
>From 08e280852c3ddec6309d2beac069d468b057c78b Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Thu, 11 Jul 2024 14:42:32 +0200
Subject: [PATCH 6/7] [NVPTX] Align Memory Ordering enum with LLVM
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 24 +--
llvm/lib/Target/NVPTX/NVPTX.h | 53 ++++++-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 146 ++++++++++--------
3 files changed, 139 insertions(+), 84 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 6f6f086f1fe95..1ac3fceb97c61 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -225,37 +225,41 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int) MO.getImm();
if (!strcmp(Modifier, "sem")) {
- switch (Imm) {
- case NVPTX::PTXLdStInstCode::NotAtomic:
+ auto ordering =
+ NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
+ switch (ordering) {
+ case NVPTX::Ordering::NotAtomic:
break;
- case NVPTX::PTXLdStInstCode::Volatile:
+ case NVPTX::Ordering::Volatile:
O << ".volatile";
break;
- case NVPTX::PTXLdStInstCode::Relaxed:
+ case NVPTX::Ordering::Relaxed:
O << ".relaxed.sys";
break;
- case NVPTX::PTXLdStInstCode::Acquire:
+ case NVPTX::Ordering::Acquire:
O << ".acquire.sys";
break;
- case NVPTX::PTXLdStInstCode::Release:
+ case NVPTX::Ordering::Release:
O << ".release.sys";
break;
- case NVPTX::PTXLdStInstCode::RelaxedMMIO:
+ case NVPTX::Ordering::RelaxedMMIO:
O << ".mmio.relaxed.sys";
break;
default:
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
- OS << "NVPTX LdStCode Printer does not support \"" << Imm
+ OS << "NVPTX LdStCode Printer does not support \"" << ordering
<< "\" sem modifier.";
report_fatal_error(OS.str());
break;
}
} else if (!strcmp(Modifier, "sc")) {
- switch (Imm) {
+ auto ordering =
+ NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
+ switch (ordering) {
// TODO: refactor fence insertion into ISelDAGToDAG instead of here,
// as part of implementing atomicrmw seq_cst.
- case NVPTX::PTXLdStInstCode::SeqCstFence:
+ case NVPTX::Ordering::SequentiallyConsistent:
O << "fence.sc.sys;\n\t";
break;
default:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index d6a4471e2bc33..602ab6e150e2a 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -106,16 +106,53 @@ enum LoadStore {
isStoreShift = 6
};
-namespace PTXLdStInstCode {
-enum MemorySemantic {
+// Extends LLVM AtomicOrdering with PTX Orderings:
+using OrderingUnderlyingType = unsigned int;
+enum class Ordering : OrderingUnderlyingType {
NotAtomic = 0, // PTX calls these: "Weak"
- Volatile = 1,
+ // Unordered = 1, // TODO: NVPTX should map this to "Relaxed"
Relaxed = 2,
- Acquire = 3,
- Release = 4,
- RelaxedMMIO = 5,
- SeqCstFence = 6,
+ // Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire"
+ Acquire = 4,
+ Release = 5,
+ // AcquireRelease = 6, // TODO
+ SequentiallyConsistent = 7,
+ Volatile = 8,
+ RelaxedMMIO = 9,
+ LAST = RelaxedMMIO
};
+
+template <typename OStream> OStream &operator<<(OStream &os, Ordering order) {
+ switch (order) {
+ case Ordering::NotAtomic:
+ os << "NotAtomic";
+ return os;
+ case Ordering::Relaxed:
+ os << "Relaxed";
+ return os;
+ case Ordering::Acquire:
+ os << "Acquire";
+ return os;
+ case Ordering::Release:
+ os << "Release";
+ return os;
+ // case Ordering::AcquireRelease:
+ // os << "AcquireRelease";
+ // return os;
+ case Ordering::SequentiallyConsistent:
+ os << "SequentiallyConsistent";
+ return os;
+ case Ordering::Volatile:
+ os << "Volatile";
+ return os;
+ case Ordering::RelaxedMMIO:
+ os << "RelaxedMMIO";
+ return os;
+ }
+ report_fatal_error("unknown ordering");
+}
+
+namespace PTXLdStInstCode {
enum AddressSpace {
GENERIC = 0,
GLOBAL = 1,
@@ -135,7 +172,7 @@ enum VecType {
V2 = 2,
V4 = 4
};
-}
+} // namespace PTXLdStInstCode
/// PTXCvtMode - Conversion code enumeration
namespace PTXCvtMode {
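The holes left in the numbering (1, 3, 6) are deliberate: the first eight values stay numerically identical to llvm::AtomicOrdering (NotAtomic = 0 through SequentiallyConsistent = 7, with Monotonic and Relaxed sharing the value 2), while Volatile and RelaxedMMIO are PTX-only extensions appended after them. A hedged sketch of a conversion this alignment would permit (hypothetical helper, not part of the patch; assumes the NVPTX.h definitions above are in scope):

    #include "llvm/Support/AtomicOrdering.h"
    #include <cassert>

    static NVPTX::Ordering fromIR(llvm::AtomicOrdering AO) {
      auto V = static_cast<NVPTX::OrderingUnderlyingType>(AO);
      assert(V <= static_cast<NVPTX::OrderingUnderlyingType>(
                      NVPTX::Ordering::SequentiallyConsistent) &&
             "Volatile/RelaxedMMIO have no llvm::AtomicOrdering equivalent");
      return NVPTX::Ordering(V);
    }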
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 64e08730af4d7..39a391aacdf82 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -700,15 +700,17 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
return NVPTX::PTXLdStInstCode::GENERIC;
}
-struct MemorySemantic {
- unsigned int sem = -1;
- unsigned int sc_fence = -1;
- MemorySemantic(unsigned int s) : sem(s) {}
- MemorySemantic(unsigned int s, unsigned int f) : sem(s), sc_fence(f) {}
+struct OperationOrderings {
+ NVPTX::OrderingUnderlyingType instr_ordering;
+ NVPTX::OrderingUnderlyingType fence_ordering;
+ OperationOrderings(NVPTX::Ordering o = NVPTX::Ordering::NotAtomic,
+ NVPTX::Ordering f = NVPTX::Ordering::NotAtomic)
+ : instr_ordering(static_cast<NVPTX::OrderingUnderlyingType>(o)),
+ fence_ordering(static_cast<NVPTX::OrderingUnderlyingType>(f)) {}
};
-static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
- const NVPTXSubtarget *Subtarget) {
+static OperationOrderings
+getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
AtomicOrdering Ordering = N->getSuccessOrdering();
auto CodeAddrSpace = getCodeAddrSpace(N);
@@ -760,7 +762,7 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
- return NVPTX::PTXLdStInstCode::NotAtomic;
+ return NVPTX::Ordering::NotAtomic;
}
// [2]: Atomics with Ordering different than Relaxed are not supported on
@@ -793,17 +795,19 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
switch (Ordering) {
case AtomicOrdering::NotAtomic:
return N->isVolatile() && AddrGenericOrGlobalOrShared
- ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ ? NVPTX::Ordering::Volatile
+ : NVPTX::Ordering::NotAtomic;
case AtomicOrdering::Monotonic:
if (N->isVolatile())
- return UseRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO
- : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
+ : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile
+ : NVPTX::Ordering::NotAtomic;
else
- return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed
- : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
+ : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile
+ : NVPTX::Ordering::NotAtomic;
+ // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
+ // Acquire.
case AtomicOrdering::Acquire:
if (!N->readMem()) {
SmallString<256> Msg;
@@ -813,8 +817,8 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
N->print(OS);
report_fatal_error(OS.str());
}
- return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Acquire
+ : NVPTX::Ordering::NotAtomic;
case AtomicOrdering::Release:
if (!N->writeMem()) {
SmallString<256> Msg;
@@ -824,22 +828,30 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
N->print(OS);
report_fatal_error(OS.str());
}
- return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Release
+ : NVPTX::Ordering::NotAtomic;
case AtomicOrdering::AcquireRelease: {
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
- OS << "PTX only supports AcquireRelease Ordering on read-modify-write: "
+    OS << "NVPTX does not support AcquireRelease Ordering on read-modify-writes "
+          "yet, and PTX does not support it on loads or stores: "
<< N->getOperationName();
N->print(OS);
report_fatal_error(OS.str());
}
case AtomicOrdering::SequentiallyConsistent: {
- unsigned int sem;
+    // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
+    // sequence: a "fence.sc.<scope>" followed by the memory instruction with
+    // an Ordering weaker than "sc" (acquire, release, or acq_rel), depending on
+ // whether the memory operation is a read, write, or read-modify-write.
+ //
+ // This sets the ordering of the fence to SequentiallyConsistent, and
+ // sets the corresponding ordering for the instruction.
+ NVPTX::Ordering ord;
if (N->readMem()) {
- sem = NVPTX::PTXLdStInstCode::Acquire;
+ ord = NVPTX::Ordering::Acquire;
} else if (N->writeMem()) {
- sem = NVPTX::PTXLdStInstCode::Release;
+ ord = NVPTX::Ordering::Release;
} else {
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
@@ -849,22 +861,20 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
N->print(OS);
report_fatal_error(OS.str());
}
- return addrGenericOrGlobalOrShared
- ? MemorySemantic(sem, NVPTX::PTXLdStInstCode::SeqCstFence)
- : MemorySemantic(NVPTX::PTXLdStInstCode::NotAtomic);
+ return AddrGenericOrGlobalOrShared
+ ? OperationOrderings(ord,
+ NVPTX::Ordering::SequentiallyConsistent)
+ : OperationOrderings(NVPTX::Ordering::NotAtomic);
}
case AtomicOrdering::Unordered:
- default: {
- // TODO: support AcquireRelease and SequentiallyConsistent
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "NVPTX backend does not support AtomicOrdering \""
- << toIRString(Ordering) << "\" yet.";
- report_fatal_error(OS.str());
- }
+ break;
}
- report_fatal_error("unreachable");
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "NVPTX backend does not support AtomicOrdering \""
+ << toIRString(Ordering) << "\" yet.";
+ report_fatal_error(OS.str());
}
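The diagnostic idiom used throughout this function (SmallString, raw_svector_ostream, report_fatal_error) repeats several times; a hedged sketch of how it could be factored into one variadic helper (hypothetical, not part of this PR):

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/ErrorHandling.h"
    #include "llvm/Support/raw_ostream.h"

    // Hypothetical wrapper for the recurring fatal-error pattern.
    template <typename... Ts>
    [[noreturn]] static void reportLoweringError(const Ts &...Parts) {
      llvm::SmallString<256> Msg;
      llvm::raw_svector_ostream OS(Msg);
      (OS << ... << Parts); // C++17 fold over operator<<
      llvm::report_fatal_error(OS.str());
    }

A call site would then shrink to something like reportLoweringError("NVPTX backend does not support AtomicOrdering \"", toIRString(Ordering), "\" yet.").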
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
@@ -1076,7 +1086,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
}
// Memory Semantic Setting
- auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(LD, Subtarget);
+ auto [InstructionOrdering, FenceOrdering] =
+ getOperationOrderings(LD, Subtarget);
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
@@ -1121,8 +1132,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1137,8 +1148,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1160,8 +1171,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1182,8 +1193,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1227,7 +1238,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
// Memory Semantic Setting
- auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
+ auto [InstructionOrdering, FenceOrdering] =
+ getOperationOrderings(MemSD, Subtarget);
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1294,8 +1306,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
- getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
+ getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1324,8 +1336,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
- getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
+ getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1375,8 +1387,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
- getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
+ getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1426,8 +1438,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
- getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
+ getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1882,7 +1894,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
// Memory Semantic Setting
- auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(ST, Subtarget);
+ auto [InstructionOrdering, FenceOrdering] =
+ getOperationOrderings(ST, Subtarget);
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1919,8 +1932,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -1937,8 +1950,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -1963,8 +1976,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
SDValue Ops[] = {Value,
- getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -1986,8 +1999,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -2027,7 +2040,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
// Memory Semantic Setting
- auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
+ auto [InstructionOrdering, FenceOrdering] =
+ getOperationOrderings(MemSD, Subtarget);
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
@@ -2069,8 +2083,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
ToTypeWidth = 32;
}
- StOps.push_back(getI32Imm(SeqCstFence, DL));
- StOps.push_back(getI32Imm(CodeMemorySem, DL));
+ StOps.push_back(getI32Imm(FenceOrdering, DL));
+ StOps.push_back(getI32Imm(InstructionOrdering, DL));
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
StOps.push_back(getI32Imm(VecType, DL));
StOps.push_back(getI32Imm(ToType, DL));
>From 00166ffbd43e7504ed0728a4f0586204166aea19 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Thu, 11 Jul 2024 18:50:37 +0200
Subject: [PATCH 7/7] [NVPTX]: Fix typos
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 10 +++---
llvm/lib/Target/NVPTX/NVPTX.h | 36 +++++++++----------
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 16 ++++-----
3 files changed, 31 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 1ac3fceb97c61..9f1e56cd84e51 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -225,9 +225,9 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int) MO.getImm();
if (!strcmp(Modifier, "sem")) {
- auto ordering =
+ auto Ordering =
NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
- switch (ordering) {
+ switch (Ordering) {
case NVPTX::Ordering::NotAtomic:
break;
case NVPTX::Ordering::Volatile:
@@ -248,15 +248,15 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
default:
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
- OS << "NVPTX LdStCode Printer does not support \"" << ordering
+ OS << "NVPTX LdStCode Printer does not support \"" << Ordering
<< "\" sem modifier.";
report_fatal_error(OS.str());
break;
}
} else if (!strcmp(Modifier, "sc")) {
- auto ordering =
+ auto Ordering =
NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
- switch (ordering) {
+ switch (Ordering) {
// TODO: refactor fence insertion into ISelDAGToDAG instead of here,
// as part of implementing atomicrmw seq_cst.
case NVPTX::Ordering::SequentiallyConsistent:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 602ab6e150e2a..c9cce23788ca4 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -122,32 +122,32 @@ enum class Ordering : OrderingUnderlyingType {
LAST = RelaxedMMIO
};
-template <typename OStream> OStream &operator<<(OStream &os, Ordering order) {
- switch (order) {
+template <typename OStream> OStream &operator<<(OStream &O, Ordering Order) {
+ switch (Order) {
case Ordering::NotAtomic:
- os << "NotAtomic";
- return os;
+ O << "NotAtomic";
+ return O;
case Ordering::Relaxed:
- os << "Relaxed";
- return os;
+ O << "Relaxed";
+ return O;
case Ordering::Acquire:
- os << "Acquire";
- return os;
+ O << "Acquire";
+ return O;
case Ordering::Release:
- os << "Release";
- return os;
+ O << "Release";
+ return O;
// case Ordering::AcquireRelease:
- // os << "AcquireRelease";
- // return os;
+ // O << "AcquireRelease";
+ // return O;
case Ordering::SequentiallyConsistent:
- os << "SequentiallyConsistent";
- return os;
+ O << "SequentiallyConsistent";
+ return O;
case Ordering::Volatile:
- os << "Volatile";
- return os;
+ O << "Volatile";
+ return O;
case Ordering::RelaxedMMIO:
- os << "RelaxedMMIO";
- return os;
+ O << "RelaxedMMIO";
+ return O;
}
report_fatal_error("unknown ordering");
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 39a391aacdf82..41a3c2102427f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -701,12 +701,12 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
}
struct OperationOrderings {
- NVPTX::OrderingUnderlyingType instr_ordering;
- NVPTX::OrderingUnderlyingType fence_ordering;
+ NVPTX::OrderingUnderlyingType InstrOrdering;
+ NVPTX::OrderingUnderlyingType FenceOrdering;
OperationOrderings(NVPTX::Ordering o = NVPTX::Ordering::NotAtomic,
NVPTX::Ordering f = NVPTX::Ordering::NotAtomic)
- : instr_ordering(static_cast<NVPTX::OrderingUnderlyingType>(o)),
- fence_ordering(static_cast<NVPTX::OrderingUnderlyingType>(f)) {}
+ : InstrOrdering(static_cast<NVPTX::OrderingUnderlyingType>(o)),
+ FenceOrdering(static_cast<NVPTX::OrderingUnderlyingType>(f)) {}
};
static OperationOrderings
@@ -847,11 +847,11 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
//
// This sets the ordering of the fence to SequentiallyConsistent, and
// sets the corresponding ordering for the instruction.
- NVPTX::Ordering ord;
+ NVPTX::Ordering InstrOrder;
if (N->readMem()) {
- ord = NVPTX::Ordering::Acquire;
+ InstrOrder = NVPTX::Ordering::Acquire;
} else if (N->writeMem()) {
- ord = NVPTX::Ordering::Release;
+ InstrOrder = NVPTX::Ordering::Release;
} else {
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
@@ -862,7 +862,7 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
report_fatal_error(OS.str());
}
return AddrGenericOrGlobalOrShared
- ? OperationOrderings(ord,
+ ? OperationOrderings(InstrOrder,
NVPTX::Ordering::SequentiallyConsistent)
: OperationOrderings(NVPTX::Ordering::NotAtomic);
}