[llvm] [NVPTX] Add Volta Atomic SequentiallyConsistent Load and Store Operations (PR #98551)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 19 09:17:40 PDT 2024
https://github.com/gonzalobg updated https://github.com/llvm/llvm-project/pull/98551
>From 1a395c1635434b1153907e3ce28e52f579e1373a Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Mon, 15 Jul 2024 12:23:44 -0700
Subject: [PATCH 1/8] [NVPTX] Support fence instruction
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 29 ++++++++++++++++++++
llvm/test/CodeGen/NVPTX/fence.ll | 36 +++++++++++++++++++++++++
2 files changed, 65 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/fence.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 7f1ac8688007e..c5146e9f33088 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3908,3 +3908,32 @@ def : Pat <
(V2I32toI64
(INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
(INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;
+
+
+////////////////////////////////////////////////////////////////////////////////
+// PTX Fence instructions
+////////////////////////////////////////////////////////////////////////////////
+
+def atomic_thread_fence_seq_cst_sys :
+ NVPTXInst<(outs), (ins), "fence.sc.sys;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+def atomic_thread_fence_acq_rel_sys :
+ NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
+ Requires<[hasPTX<60>, hasSM<70>]>;
+
+def atomic_thread_fence_seq_cst_sys_membar :
+ NVPTXInst<(outs), (ins), "membar.sys;", []>;
+
+def : Pat<(atomic_fence (i64 4), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acquire(4) sys(1)
+ Requires<[hasPTX<60>, hasSM<70>]>;
+def : Pat<(atomic_fence (i64 5), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // release(5) sys(1)
+ Requires<[hasPTX<60>, hasSM<70>]>;
+def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acq_rel(6) sys(1)
+ Requires<[hasPTX<60>, hasSM<70>]>;
+def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys)>, // seq_cst(7) sys(1)
+ Requires<[hasPTX<60>, hasSM<70>]>;
+
+def : Pat<(atomic_fence (i64 4), (i64 1)), (atomic_thread_fence_seq_cst_sys_membar)>; // acquire(4) sys(1)
+def : Pat<(atomic_fence (i64 5), (i64 1)), (atomic_thread_fence_seq_cst_sys_membar)>; // release(5) sys(1)
+def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_seq_cst_sys_membar)>; // acq_rel(6) sys(1)
+def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys_membar)>; // seq_cst(7) sys(1)
diff --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll
new file mode 100644
index 0000000000000..d3aace95e9665
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=SM60
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %}
+
+; CHECK-LABEL: fence_sc_sys
+define void @fence_sc_sys() local_unnamed_addr {
+ ; SM60: membar.sys
+ ; SM70: fence.sc.sys
+ fence seq_cst
+ ret void
+}
+
+; CHECK-LABEL: fence_acq_rel_sys
+define void @fence_acq_rel_sys() local_unnamed_addr {
+ ; SM60: membar.sys
+ ; SM70: fence.acq_rel.sys
+ fence acq_rel
+ ret void
+}
+
+; CHECK-LABEL: fence_release_sys
+define void @fence_release_sys() local_unnamed_addr {
+ ; SM60: membar.sys
+ ; SM70: fence.acq_rel.sys
+ fence release
+ ret void
+}
+
+; CHECK-LABEL: fence_acquire_sys
+define void @fence_acquire_sys() local_unnamed_addr {
+ ; SM60: membar.sys
+ ; SM70: fence.acq_rel.sys
+ fence acquire
+ ret void
+}
\ No newline at end of file
>From 021fd6df55fa154e22e5adf519b187b1387ad2c5 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Tue, 16 Jul 2024 07:00:13 -0700
Subject: [PATCH 2/8] Fixup: use INT_MEMBAR_SYS instead of redefining the
membar.sys opcode
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index c5146e9f33088..b685949c5eb79 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3921,9 +3921,6 @@ def atomic_thread_fence_acq_rel_sys :
NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
Requires<[hasPTX<60>, hasSM<70>]>;
-def atomic_thread_fence_seq_cst_sys_membar :
- NVPTXInst<(outs), (ins), "membar.sys;", []>;
-
def : Pat<(atomic_fence (i64 4), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // acquire(4) sys(1)
Requires<[hasPTX<60>, hasSM<70>]>;
def : Pat<(atomic_fence (i64 5), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, // release(5) sys(1)
@@ -3933,7 +3930,7 @@ def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_acq_rel_sys)>, /
def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys)>, // seq_cst(7) sys(1)
Requires<[hasPTX<60>, hasSM<70>]>;
-def : Pat<(atomic_fence (i64 4), (i64 1)), (atomic_thread_fence_seq_cst_sys_membar)>; // acquire(4) sys(1)
-def : Pat<(atomic_fence (i64 5), (i64 1)), (atomic_thread_fence_seq_cst_sys_membar)>; // release(5) sys(1)
-def : Pat<(atomic_fence (i64 6), (i64 1)), (atomic_thread_fence_seq_cst_sys_membar)>; // acq_rel(6) sys(1)
-def : Pat<(atomic_fence (i64 7), (i64 1)), (atomic_thread_fence_seq_cst_sys_membar)>; // seq_cst(7) sys(1)
+def : Pat<(atomic_fence (i64 4), (i64 1)), (INT_MEMBAR_SYS)>; // acquire(4) sys(1)
+def : Pat<(atomic_fence (i64 5), (i64 1)), (INT_MEMBAR_SYS)>; // release(5) sys(1)
+def : Pat<(atomic_fence (i64 6), (i64 1)), (INT_MEMBAR_SYS)>; // acq_rel(6) sys(1)
+def : Pat<(atomic_fence (i64 7), (i64 1)), (INT_MEMBAR_SYS)>; // seq_cst(7) sys(1)
>From 93b5524a074e9e3993505f3d5895f59f2ab61bbe Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Tue, 9 Jul 2024 18:17:44 +0200
Subject: [PATCH 3/8] [NVPTX] Volta SequentiallyConsistent Load/Store Ops
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 10 ++
llvm/lib/Target/NVPTX/NVPTX.h | 3 +-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 87 ++++++++---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 144 +++++++++---------
llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 55 +++++++
5 files changed, 205 insertions(+), 94 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index a004d64c21cc6..addb326e156f0 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -254,6 +254,16 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
report_fatal_error(OS.str());
break;
}
+ } else if (!strcmp(Modifier, "sc")) {
+ switch (Imm) {
+ // TODO: move fence insertion into ISelDAGToDAG instead of emitting it here,
+ // as part of implementing atomicrmw seq_cst.
+ case NVPTX::PTXLdStInstCode::SeqCstFence:
+ O << "fence.sc.sys;\n\t";
+ break;
+ default:
+ break;
+ }
} else if (!strcmp(Modifier, "addsp")) {
switch (Imm) {
case NVPTX::PTXLdStInstCode::GLOBAL:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 3c7167b157025..d6a4471e2bc33 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -113,7 +113,8 @@ enum MemorySemantic {
Relaxed = 2,
Acquire = 3,
Release = 4,
- RelaxedMMIO = 5
+ RelaxedMMIO = 5,
+ SeqCstFence = 6,
};
enum AddressSpace {
GENERIC = 0,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 371ec8596ef63..330f81c6679e7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -714,21 +714,24 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
return NVPTX::PTXLdStInstCode::GENERIC;
}
-static unsigned int getCodeMemorySemantic(MemSDNode *N,
- const NVPTXSubtarget *Subtarget) {
+struct MemorySemantic {
+ unsigned int sem = -1;
+ unsigned int sc_fence = -1;
+ MemorySemantic(unsigned int s) : sem(s) {}
+ MemorySemantic(unsigned int s, unsigned int f) : sem(s), sc_fence(f) {}
+};
+
+static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
+ const NVPTXSubtarget *Subtarget) {
AtomicOrdering Ordering = N->getSuccessOrdering();
auto CodeAddrSpace = getCodeAddrSpace(N);
bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
- // TODO: lowering for SequentiallyConsistent Operations: for now, we error.
- // TODO: lowering for AcquireRelease Operations: for now, we error.
- //
-
// clang-format off
- // Lowering for non-SequentiallyConsistent Operations
+ // Lowering for Load/Store Operations (note: AcquireRelease loads and stores are rejected with an error).
//
// | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
// |---------|----------|--------------------|------------|------------------------------|
@@ -748,6 +751,18 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
// | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
// | | | / Global [0] | | |
+ // Lowering of CUDA C++ SequentiallyConsistent Operations and Fences to PTX
+ // by following the ABI proven sound in:
+ // Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
+ // https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
+ //
+ // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
+ // |-----------------------------------------------------------------------------|-----------------------------------------|
+ // | cuda::atomic_thread_fence(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; |
+ // | cuda::atomic_load(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; ld.acquire.<scope>; |
+ // | cuda::atomic_store(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; st.release.<scope>; |
+ // | cuda::atomic_fetch_<op>(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; atom.acq_rel.<scope>; |
+
// clang-format on
// [0]: volatile and atomics are only supported on global or shared
@@ -787,7 +802,6 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
// - the "weak" memory instruction we are currently lowering to, and
// - some other instruction that preserves the side-effect, e.g.,
// a dead dummy volatile load.
-
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
@@ -865,7 +879,25 @@ static unsigned int getCodeMemorySemantic(MemSDNode *N,
N->print(OS);
report_fatal_error(OS.str());
}
- case AtomicOrdering::SequentiallyConsistent:
+ case AtomicOrdering::SequentiallyConsistent: {
+ unsigned int sem;
+ if (N->readMem()) {
+ sem = NVPTX::PTXLdStInstCode::Acquire;
+ } else if (N->writeMem()) {
+ sem = NVPTX::PTXLdStInstCode::Release;
+ } else {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "NVPTX does not support SequentiallyConsistent Ordering on "
+ "read-modify-writes yet: "
+ << N->getOperationName();
+ N->print(OS);
+ report_fatal_error(OS.str());
+ }
+ return addrGenericOrGlobalOrShared
+ ? MemorySemantic(sem, NVPTX::PTXLdStInstCode::SeqCstFence)
+ : MemorySemantic(NVPTX::PTXLdStInstCode::NotAtomic);
+ }
case AtomicOrdering::Unordered:
// TODO: support AcquireRelease and SequentiallyConsistent
SmallString<256> Msg;
@@ -1087,7 +1119,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
}
// Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget);
+ auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(LD, Subtarget);
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
@@ -1132,7 +1164,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1147,7 +1180,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1169,7 +1203,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1190,7 +1225,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
+ getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1234,7 +1270,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
// Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
+ auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1301,7 +1337,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
+ getI32Imm(CodeMemorySem, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1330,7 +1367,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
+ getI32Imm(CodeMemorySem, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1380,7 +1418,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
+ getI32Imm(CodeMemorySem, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1430,7 +1469,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
+ getI32Imm(CodeMemorySem, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1885,7 +1925,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
// Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget);
+ auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(ST, Subtarget);
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1922,6 +1962,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
+ getI32Imm(SeqCstFence, dl),
getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -1939,6 +1980,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
+ getI32Imm(SeqCstFence, dl),
getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -1964,6 +2006,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
SDValue Ops[] = {Value,
+ getI32Imm(SeqCstFence, dl),
getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -1986,6 +2029,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
+ getI32Imm(SeqCstFence, dl),
getI32Imm(CodeMemorySem, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -2026,7 +2070,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
// Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
+ auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
@@ -2068,6 +2112,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
ToTypeWidth = 32;
}
+ StOps.push_back(getI32Imm(SeqCstFence, DL));
StOps.push_back(getI32Imm(CodeMemorySem, DL));
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
StOps.push_back(getI32Imm(VecType, DL));
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index b685949c5eb79..c2cf0c8fc05e9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2958,39 +2958,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
multiclass LD<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _ari : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _ari_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _asi : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
}
@@ -3006,39 +3006,39 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _ari : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _asi : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
}
@@ -3057,75 +3057,75 @@ let mayStore=1, hasSideEffects=0 in {
multiclass LD_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v4_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
@@ -3140,84 +3140,84 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_ari : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_asi : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v4_avar : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_asi : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}"
"$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
index 7cdced1778a53..2a74b24084434 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -165,6 +165,59 @@ define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnam
ret void
}
+; CHECK-LABEL: generic_sc
+define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a seq_cst, align 1
+ %a.add = add i8 %a.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a seq_cst, align 1
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b seq_cst, align 2
+ %b.add = add i16 %b.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b seq_cst, align 2
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c seq_cst, align 4
+ %c.add = add i32 %c.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d seq_cst, align 8
+ %d.add = add i64 %d.load, 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d seq_cst, align 8
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e seq_cst, align 4
+
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e seq_cst, align 8
+
+ ret void
+}
+
; CHECK-LABEL: generic_monotonic_volatile
define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
@@ -949,3 +1002,5 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
ret void
}
+
+
>From 8d1eb2ee1b8c626eb27a1a239cacd68142d83328 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Thu, 11 Jul 2024 14:42:32 +0200
Subject: [PATCH 4/8] [NVPTX] Align Memory Ordering enum with LLVM
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 24 ++--
llvm/lib/Target/NVPTX/NVPTX.h | 53 +++++--
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 131 ++++++++++--------
3 files changed, 133 insertions(+), 75 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index addb326e156f0..d7a3daa450f0f 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -228,37 +228,41 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int) MO.getImm();
if (!strcmp(Modifier, "sem")) {
- switch (Imm) {
- case NVPTX::PTXLdStInstCode::NotAtomic:
+ auto ordering =
+ NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
+ switch (ordering) {
+ case NVPTX::Ordering::NotAtomic:
break;
- case NVPTX::PTXLdStInstCode::Volatile:
+ case NVPTX::Ordering::Volatile:
O << ".volatile";
break;
- case NVPTX::PTXLdStInstCode::Relaxed:
+ case NVPTX::Ordering::Relaxed:
O << ".relaxed.sys";
break;
- case NVPTX::PTXLdStInstCode::Acquire:
+ case NVPTX::Ordering::Acquire:
O << ".acquire.sys";
break;
- case NVPTX::PTXLdStInstCode::Release:
+ case NVPTX::Ordering::Release:
O << ".release.sys";
break;
- case NVPTX::PTXLdStInstCode::RelaxedMMIO:
+ case NVPTX::Ordering::RelaxedMMIO:
O << ".mmio.relaxed.sys";
break;
default:
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
- OS << "NVPTX LdStCode Printer does not support \"" << Imm
+ OS << "NVPTX LdStCode Printer does not support \"" << ordering
<< "\" sem modifier.";
report_fatal_error(OS.str());
break;
}
} else if (!strcmp(Modifier, "sc")) {
- switch (Imm) {
+ auto ordering =
+ NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
+ switch (ordering) {
// TODO: refactor fence insertion in ISelDagToDag instead of here
// as part of implementing atomicrmw seq_cst.
- case NVPTX::PTXLdStInstCode::SeqCstFence:
+ case NVPTX::Ordering::SequentiallyConsistent:
O << "fence.sc.sys;\n\t";
break;
default:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index d6a4471e2bc33..602ab6e150e2a 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -106,16 +106,53 @@ enum LoadStore {
isStoreShift = 6
};
-namespace PTXLdStInstCode {
-enum MemorySemantic {
+// Extends LLVM AtomicOrdering with PTX Orderings:
+using OrderingUnderlyingType = unsigned int;
+enum class Ordering : OrderingUnderlyingType {
NotAtomic = 0, // PTX calls these: "Weak"
- Volatile = 1,
+ // Unordered = 1, // TODO: NVPTX should map this to "Relaxed"
Relaxed = 2,
- Acquire = 3,
- Release = 4,
- RelaxedMMIO = 5,
- SeqCstFence = 6,
+ // Consume = 3, // Unimplemented in LLVM; NVPTX would map to "Acquire"
+ Acquire = 4,
+ Release = 5,
+ // AcquireRelease = 6, // TODO
+ SequentiallyConsistent = 7,
+ Volatile = 8,
+ RelaxedMMIO = 9,
+ LAST = RelaxedMMIO
};
+
+template <typename OStream> OStream &operator<<(OStream &os, Ordering order) {
+ switch (order) {
+ case Ordering::NotAtomic:
+ os << "NotAtomic";
+ return os;
+ case Ordering::Relaxed:
+ os << "Relaxed";
+ return os;
+ case Ordering::Acquire:
+ os << "Acquire";
+ return os;
+ case Ordering::Release:
+ os << "Release";
+ return os;
+ // case Ordering::AcquireRelease:
+ // os << "AcquireRelease";
+ // return os;
+ case Ordering::SequentiallyConsistent:
+ os << "SequentiallyConsistent";
+ return os;
+ case Ordering::Volatile:
+ os << "Volatile";
+ return os;
+ case Ordering::RelaxedMMIO:
+ os << "RelaxedMMIO";
+ return os;
+ }
+ report_fatal_error("unknown ordering");
+}
+
+namespace PTXLdStInstCode {
enum AddressSpace {
GENERIC = 0,
GLOBAL = 1,
@@ -135,7 +172,7 @@ enum VecType {
V2 = 2,
V4 = 4
};
-}
+} // namespace PTXLdStInstCode
/// PTXCvtMode - Conversion code enumeration
namespace PTXCvtMode {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 330f81c6679e7..e23783bfde22e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -714,15 +714,17 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
return NVPTX::PTXLdStInstCode::GENERIC;
}
-struct MemorySemantic {
- unsigned int sem = -1;
- unsigned int sc_fence = -1;
- MemorySemantic(unsigned int s) : sem(s) {}
- MemorySemantic(unsigned int s, unsigned int f) : sem(s), sc_fence(f) {}
+struct OperationOrderings {
+ NVPTX::OrderingUnderlyingType instr_ordering;
+ NVPTX::OrderingUnderlyingType fence_ordering;
+ OperationOrderings(NVPTX::Ordering o = NVPTX::Ordering::NotAtomic,
+ NVPTX::Ordering f = NVPTX::Ordering::NotAtomic)
+ : instr_ordering(static_cast<NVPTX::OrderingUnderlyingType>(o)),
+ fence_ordering(static_cast<NVPTX::OrderingUnderlyingType>(f)) {}
};
-static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
- const NVPTXSubtarget *Subtarget) {
+static OperationOrderings
+getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
AtomicOrdering Ordering = N->getSuccessOrdering();
auto CodeAddrSpace = getCodeAddrSpace(N);
@@ -805,7 +807,7 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
- return NVPTX::PTXLdStInstCode::NotAtomic;
+ return NVPTX::Ordering::NotAtomic;
}
// [2]: Atomics with Ordering different than Relaxed are not supported on
@@ -838,17 +840,19 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
switch (Ordering) {
case AtomicOrdering::NotAtomic:
return N->isVolatile() && AddrGenericOrGlobalOrShared
- ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ ? NVPTX::Ordering::Volatile
+ : NVPTX::Ordering::NotAtomic;
case AtomicOrdering::Monotonic:
if (N->isVolatile())
- return UseRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO
- : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ return UseRelaxedMMIO ? NVPTX::Ordering::RelaxedMMIO
+ : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile
+ : NVPTX::Ordering::NotAtomic;
else
- return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed
- : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ return HasMemoryOrdering ? NVPTX::Ordering::Relaxed
+ : AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Volatile
+ : NVPTX::Ordering::NotAtomic;
+ // case AtomicOrdering::Consume: // If LLVM ever provides this, lower it to
+ // Acquire.
case AtomicOrdering::Acquire:
if (!N->readMem()) {
SmallString<256> Msg;
@@ -858,8 +862,8 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
N->print(OS);
report_fatal_error(OS.str());
}
- return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Acquire
+ : NVPTX::Ordering::NotAtomic;
case AtomicOrdering::Release:
if (!N->writeMem()) {
SmallString<256> Msg;
@@ -869,22 +873,30 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
N->print(OS);
report_fatal_error(OS.str());
}
- return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release
- : NVPTX::PTXLdStInstCode::NotAtomic;
+ return AddrGenericOrGlobalOrShared ? NVPTX::Ordering::Release
+ : NVPTX::Ordering::NotAtomic;
case AtomicOrdering::AcquireRelease: {
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
- OS << "PTX only supports AcquireRelease Ordering on read-modify-write: "
+ OS << "NVPTX does not support AcquireRelease Ordering on read-modify-write "
+ "yet and PTX does not support it on loads or stores: "
<< N->getOperationName();
N->print(OS);
report_fatal_error(OS.str());
}
case AtomicOrdering::SequentiallyConsistent: {
- unsigned int sem;
+ // LLVM-IR SequentiallyConsistent atomics map to a two-instruction PTX
+ // sequence including a "fence.sc.sco" and the memory instruction with an
+ // Ordering that differs from "sc": acq, rel, or acq_rel, depending on
+ // whether the memory operation is a read, write, or read-modify-write.
+ //
+ // This sets the ordering of the fence to SequentiallyConsistent, and
+ // sets the corresponding ordering for the instruction.
+ NVPTX::Ordering ord;
if (N->readMem()) {
- sem = NVPTX::PTXLdStInstCode::Acquire;
+ ord = NVPTX::Ordering::Acquire;
} else if (N->writeMem()) {
- sem = NVPTX::PTXLdStInstCode::Release;
+ ord = NVPTX::Ordering::Release;
} else {
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
@@ -894,9 +906,10 @@ static MemorySemantic getCodeMemorySemantic(MemSDNode *N,
N->print(OS);
report_fatal_error(OS.str());
}
- return addrGenericOrGlobalOrShared
- ? MemorySemantic(sem, NVPTX::PTXLdStInstCode::SeqCstFence)
- : MemorySemantic(NVPTX::PTXLdStInstCode::NotAtomic);
+ return AddrGenericOrGlobalOrShared
+ ? OperationOrderings(ord,
+ NVPTX::Ordering::SequentiallyConsistent)
+ : OperationOrderings(NVPTX::Ordering::NotAtomic);
}
case AtomicOrdering::Unordered:
// TODO: support AcquireRelease and SequentiallyConsistent
@@ -1119,7 +1132,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
}
// Memory Semantic Setting
- auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(LD, Subtarget);
+ auto [InstructionOrdering, FenceOrdering] =
+ getOperationOrderings(LD, Subtarget);
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
@@ -1164,8 +1178,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1180,8 +1194,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1203,8 +1217,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1225,8 +1239,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1270,7 +1284,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
// Memory Semantic Setting
- auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
+ auto [InstructionOrdering, FenceOrdering] =
+ getOperationOrderings(MemSD, Subtarget);
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1337,8 +1352,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
- getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
+ getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1367,8 +1382,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
- getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
+ getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1418,8 +1433,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
- getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
+ getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1469,8 +1484,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(SeqCstFence, DL),
- getI32Imm(CodeMemorySem, DL),
+ SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
+ getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1925,7 +1940,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
// Memory Semantic Setting
- auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(ST, Subtarget);
+ auto [InstructionOrdering, FenceOrdering] =
+ getOperationOrderings(ST, Subtarget);
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
@@ -1962,8 +1978,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -1980,8 +1996,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -2006,8 +2022,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
SDValue Ops[] = {Value,
- getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -2029,8 +2045,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(SeqCstFence, dl),
- getI32Imm(CodeMemorySem, dl),
+ getI32Imm(FenceOrdering, dl),
+ getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(toType, dl),
@@ -2070,7 +2086,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
// Memory Semantic Setting
- auto [CodeMemorySem, SeqCstFence] = getCodeMemorySemantic(MemSD, Subtarget);
+ auto [InstructionOrdering, FenceOrdering] =
+ getOperationOrderings(MemSD, Subtarget);
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
@@ -2112,8 +2129,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
ToTypeWidth = 32;
}
- StOps.push_back(getI32Imm(SeqCstFence, DL));
- StOps.push_back(getI32Imm(CodeMemorySem, DL));
+ StOps.push_back(getI32Imm(FenceOrdering, DL));
+ StOps.push_back(getI32Imm(InstructionOrdering, DL));
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
StOps.push_back(getI32Imm(VecType, DL));
StOps.push_back(getI32Imm(ToType, DL));
>From f6f22b3492bb0a138d39ae1bd3b68e99b50fa1dd Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Thu, 11 Jul 2024 18:50:37 +0200
Subject: [PATCH 5/8] [NVPTX]: Fix typos
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 10 +++---
llvm/lib/Target/NVPTX/NVPTX.h | 36 +++++++++----------
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 35 ++++++++++--------
llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 2 --
4 files changed, 44 insertions(+), 39 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index d7a3daa450f0f..45561d5a11238 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -228,9 +228,9 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int) MO.getImm();
if (!strcmp(Modifier, "sem")) {
- auto ordering =
+ auto Ordering =
NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
- switch (ordering) {
+ switch (Ordering) {
case NVPTX::Ordering::NotAtomic:
break;
case NVPTX::Ordering::Volatile:
@@ -251,15 +251,15 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
default:
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
- OS << "NVPTX LdStCode Printer does not support \"" << ordering
+ OS << "NVPTX LdStCode Printer does not support \"" << Ordering
<< "\" sem modifier.";
report_fatal_error(OS.str());
break;
}
} else if (!strcmp(Modifier, "sc")) {
- auto ordering =
+ auto Ordering =
NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
- switch (ordering) {
+ switch (Ordering) {
// TODO: refactor fence insertion in ISelDagToDag instead of here
// as part of implementing atomicrmw seq_cst.
case NVPTX::Ordering::SequentiallyConsistent:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 602ab6e150e2a..c9cce23788ca4 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -122,32 +122,32 @@ enum class Ordering : OrderingUnderlyingType {
LAST = RelaxedMMIO
};
-template <typename OStream> OStream &operator<<(OStream &os, Ordering order) {
- switch (order) {
+template <typename OStream> OStream &operator<<(OStream &O, Ordering Order) {
+ switch (Order) {
case Ordering::NotAtomic:
- os << "NotAtomic";
- return os;
+ O << "NotAtomic";
+ return O;
case Ordering::Relaxed:
- os << "Relaxed";
- return os;
+ O << "Relaxed";
+ return O;
case Ordering::Acquire:
- os << "Acquire";
- return os;
+ O << "Acquire";
+ return O;
case Ordering::Release:
- os << "Release";
- return os;
+ O << "Release";
+ return O;
// case Ordering::AcquireRelease:
- // os << "AcquireRelease";
- // return os;
+ // O << "AcquireRelease";
+ // return O;
case Ordering::SequentiallyConsistent:
- os << "SequentiallyConsistent";
- return os;
+ O << "SequentiallyConsistent";
+ return O;
case Ordering::Volatile:
- os << "Volatile";
- return os;
+ O << "Volatile";
+ return O;
case Ordering::RelaxedMMIO:
- os << "RelaxedMMIO";
- return os;
+ O << "RelaxedMMIO";
+ return O;
}
report_fatal_error("unknown ordering");
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index e23783bfde22e..a75d050ff7ecc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -715,12 +715,12 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
}
struct OperationOrderings {
- NVPTX::OrderingUnderlyingType instr_ordering;
- NVPTX::OrderingUnderlyingType fence_ordering;
+ NVPTX::OrderingUnderlyingType InstrOrdering;
+ NVPTX::OrderingUnderlyingType FenceOrdering;
OperationOrderings(NVPTX::Ordering o = NVPTX::Ordering::NotAtomic,
NVPTX::Ordering f = NVPTX::Ordering::NotAtomic)
- : instr_ordering(static_cast<NVPTX::OrderingUnderlyingType>(o)),
- fence_ordering(static_cast<NVPTX::OrderingUnderlyingType>(f)) {}
+ : InstrOrdering(static_cast<NVPTX::OrderingUnderlyingType>(o)),
+ FenceOrdering(static_cast<NVPTX::OrderingUnderlyingType>(f)) {}
};
static OperationOrderings
@@ -758,12 +758,19 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
// Lustig et al, A Formal Analysis of the NVIDIA PTX Memory Consistency Model, ASPLOS’19.
// https://dl.acm.org/doi/pdf/10.1145/3297858.3304043
//
- // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
- // |-----------------------------------------------------------------------------|-----------------------------------------|
- // | cuda::atomic_thread_fence(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; |
- // | cuda::atomic_load(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; ld.acquire.<scope>; |
- // | cuda::atomic_store(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; st.release.<scope>; |
- // | cuda::atomic_fetch_<op>(memory_order_seq_cst, cuda::thread_scope_<scope>) | fence.sc.<scope>; atom.acq_rel.<scope>; |
+ // | CUDA C++ Atomic Operation or Atomic Fence | PTX Atomic Operation or Fence |
+ // |------------------------------------------------------|-------------------------------|
+ // | cuda::atomic_thread_fence | fence.sc.<scope>; |
+ // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | |
+ // |------------------------------------------------------|-------------------------------|
+ // | cuda::atomic_load | fence.sc.<scope>; |
+ // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | ld.acquire.<scope>; |
+ // |------------------------------------------------------|-------------------------------|
+ // | cuda::atomic_store | fence.sc.<scope>; |
+ // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | st.release.<scope>; |
+ // |------------------------------------------------------|-------------------------------|
+ // | cuda::atomic_fetch_<op> | fence.sc.<scope>; |
+ // | (memory_order_seq_cst, cuda::thread_scope_<scope>) | atom.acq_rel.<scope>; |
// clang-format on
@@ -892,11 +899,11 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
//
// This sets the ordering of the fence to SequentiallyConsistent, and
// sets the corresponding ordering for the instruction.
- NVPTX::Ordering ord;
+ NVPTX::Ordering InstrOrder;
if (N->readMem()) {
- ord = NVPTX::Ordering::Acquire;
+ InstrOrder = NVPTX::Ordering::Acquire;
} else if (N->writeMem()) {
- ord = NVPTX::Ordering::Release;
+ InstrOrder = NVPTX::Ordering::Release;
} else {
SmallString<256> Msg;
raw_svector_ostream OS(Msg);
@@ -907,7 +914,7 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
report_fatal_error(OS.str());
}
return AddrGenericOrGlobalOrShared
- ? OperationOrderings(ord,
+ ? OperationOrderings(InstrOrder,
NVPTX::Ordering::SequentiallyConsistent)
: OperationOrderings(NVPTX::Ordering::NotAtomic);
}
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
index 2a74b24084434..cd6b565053300 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -1002,5 +1002,3 @@ define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, pt
ret void
}
-
-
>From 4eaef95f045d8ddb2c23dbaffbea39b03396ce78 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Tue, 16 Jul 2024 01:00:37 -0700
Subject: [PATCH 6/8] [NVPTX] Cleanup SeqCst Load/Store
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 109 +++++++++++----
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 144 ++++++++++----------
2 files changed, 158 insertions(+), 95 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index a75d050ff7ecc..e18b0e0fb3d9f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1145,6 +1145,26 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
+ // If a fence is required before the operation, insert it:
+ SDValue Chain = N->getOperand(0);
+ switch (NVPTX::Ordering(FenceOrdering)) {
+ case NVPTX::Ordering::NotAtomic:
+ break;
+ case NVPTX::Ordering::SequentiallyConsistent: {
+ unsigned Op = Subtarget->hasMemoryOrdering()
+ ? NVPTX::atomic_thread_fence_seq_cst_sys
+ : NVPTX::atomic_thread_fence_seq_cst_sys_membar;
+ Chain = SDValue(CurDAG->getMachineNode(Op, dl, MVT::Other, Chain), 0);
+ break;
+ }
+ default:
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering)
+ << "\".";
+ report_fatal_error(OS.str());
+ }
+
// Type Setting: fromType + fromTypeWidth
//
// Sign : ISD::SEXTLOAD
@@ -1172,7 +1192,6 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
fromType = getLdStRegType(ScalarVT);
// Create the machine instruction DAG
- SDValue Chain = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue Addr;
SDValue Offset, Base;
@@ -1185,8 +1204,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
- getI32Imm(InstructionOrdering, dl),
+ SDValue Ops[] = {getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1201,8 +1219,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
- getI32Imm(InstructionOrdering, dl),
+ SDValue Ops[] = {getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1224,8 +1241,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
- getI32Imm(InstructionOrdering, dl),
+ SDValue Ops[] = {getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1246,8 +1262,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(FenceOrdering, dl),
- getI32Imm(InstructionOrdering, dl),
+ SDValue Ops[] = {getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
getI32Imm(fromType, dl),
@@ -1294,6 +1309,25 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
auto [InstructionOrdering, FenceOrdering] =
getOperationOrderings(MemSD, Subtarget);
+ // If a fence is required before the operation, insert it:
+ switch (NVPTX::Ordering(FenceOrdering)) {
+ case NVPTX::Ordering::NotAtomic:
+ break;
+ case NVPTX::Ordering::SequentiallyConsistent: {
+ unsigned Op = Subtarget->hasMemoryOrdering()
+ ? NVPTX::atomic_thread_fence_seq_cst_sys
+ : NVPTX::atomic_thread_fence_seq_cst_sys_membar;
+ Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
+ break;
+ }
+ default:
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering)
+ << "\".";
+ report_fatal_error(OS.str());
+ }
+
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1359,8 +1393,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
- getI32Imm(InstructionOrdering, DL),
+ SDValue Ops[] = {getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1389,8 +1422,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
- getI32Imm(InstructionOrdering, DL),
+ SDValue Ops[] = {getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1440,8 +1472,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
- getI32Imm(InstructionOrdering, DL),
+ SDValue Ops[] = {getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1491,8 +1522,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(FenceOrdering, DL),
- getI32Imm(InstructionOrdering, DL),
+ SDValue Ops[] = {getI32Imm(InstructionOrdering, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
@@ -1950,6 +1980,26 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
auto [InstructionOrdering, FenceOrdering] =
getOperationOrderings(ST, Subtarget);
+ // If a fence is required before the operation, insert it:
+ SDValue Chain = ST->getChain();
+ switch (NVPTX::Ordering(FenceOrdering)) {
+ case NVPTX::Ordering::NotAtomic:
+ break;
+ case NVPTX::Ordering::SequentiallyConsistent: {
+ unsigned Op = Subtarget->hasMemoryOrdering()
+ ? NVPTX::atomic_thread_fence_seq_cst_sys
+ : NVPTX::atomic_thread_fence_seq_cst_sys_membar;
+ Chain = SDValue(CurDAG->getMachineNode(Op, dl, MVT::Other, Chain), 0);
+ break;
+ }
+ default:
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering)
+ << "\".";
+ report_fatal_error(OS.str());
+ }
+
// Vector Setting
MVT SimpleVT = StoreVT.getSimpleVT();
unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
@@ -1969,7 +2019,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
unsigned int toType = getLdStRegType(ScalarVT);
// Create the machine instruction DAG
- SDValue Chain = ST->getChain();
SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
SDValue BasePtr = ST->getBasePtr();
SDValue Addr;
@@ -1985,7 +2034,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(FenceOrdering, dl),
getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -2003,7 +2051,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(FenceOrdering, dl),
getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -2029,7 +2076,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
return false;
SDValue Ops[] = {Value,
- getI32Imm(FenceOrdering, dl),
getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -2052,7 +2098,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!Opcode)
return false;
SDValue Ops[] = {Value,
- getI32Imm(FenceOrdering, dl),
getI32Imm(InstructionOrdering, dl),
getI32Imm(CodeAddrSpace, dl),
getI32Imm(vecType, dl),
@@ -2096,6 +2141,25 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
auto [InstructionOrdering, FenceOrdering] =
getOperationOrderings(MemSD, Subtarget);
+ // If a fence is required before the operation, insert it:
+ switch (NVPTX::Ordering(FenceOrdering)) {
+ case NVPTX::Ordering::NotAtomic:
+ break;
+ case NVPTX::Ordering::SequentiallyConsistent: {
+ unsigned Op = Subtarget->hasMemoryOrdering()
+ ? NVPTX::atomic_thread_fence_seq_cst_sys
+ : NVPTX::atomic_thread_fence_seq_cst_sys_membar;
+ Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
+ break;
+ }
+ default:
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "Unexpected fence ordering: \"" << NVPTX::Ordering(FenceOrdering)
+ << "\".";
+ report_fatal_error(OS.str());
+ }
+
// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'
assert(StoreVT.isSimple() && "Store value is not simple");
@@ -2136,7 +2200,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
ToTypeWidth = 32;
}
- StOps.push_back(getI32Imm(FenceOrdering, DL));
StOps.push_back(getI32Imm(InstructionOrdering, DL));
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
StOps.push_back(getI32Imm(VecType, DL));
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index c2cf0c8fc05e9..b685949c5eb79 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2958,39 +2958,39 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
multiclass LD<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _areg_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;
def _ari : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _ari_64 : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
def _asi : NVPTXInst<
(outs regclass:$dst),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t$dst, [$addr+$offset];", []>;
}
@@ -3006,39 +3006,39 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST<NVPTXRegClass regclass> {
def _avar : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, imem:$addr),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;
def _ari : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int32Regs:$addr, i32imm:$offset),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, Int64Regs:$addr, i32imm:$offset),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
def _asi : NVPTXInst<
(outs),
- (ins regclass:$src, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
+ (ins regclass:$src, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$Sign, i32imm:$toWidth, imem:$addr, i32imm:$offset),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
" \t[$addr+$offset], $src;", []>;
}
@@ -3057,75 +3057,75 @@ let mayStore=1, hasSideEffects=0 in {
multiclass LD_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v2_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v2_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr+$offset];", []>;
def _v4_avar : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_areg_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
def _v4_ari : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_ari_64 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
def _v4_asi : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ (ins LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "${sc:sc}ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr+$offset];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
@@ -3140,84 +3140,84 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST_VEC<NVPTXRegClass regclass> {
def _v2_avar : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_areg_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v2_ari : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int32Regs:$addr,
i32imm:$offset),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_ari_64 : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, Int64Regs:$addr,
i32imm:$offset),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v2_asi : NVPTXInst<
(outs),
- (ins regclass:$src1, regclass:$src2, LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp,
+ (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, imem:$addr,
i32imm:$offset),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2}};", []>;
def _v4_avar : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_areg_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int32Regs:$addr, i32imm:$offset),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_ari_64 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, Int64Regs:$addr, i32imm:$offset),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"\t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
def _v4_asi : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
- LdStCode:$sc, LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
+ LdStCode:$sem, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
i32imm:$fromWidth, imem:$addr, i32imm:$offset),
- "${sc:sc}st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}"
+ "st${sem:sem}${addsp:addsp}${Vec:vec}.${Sign:sign}"
"$fromWidth \t[$addr+$offset], {{$src1, $src2, $src3, $src4}};", []>;
}
>From d955de0998cca17485d908a3af5319cb06537e2f Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Tue, 16 Jul 2024 05:14:21 -0700
Subject: [PATCH 7/8] [NVPTX] Add vector load/store tests and refactor
load/store tests
---
llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 895 ++++++++------------
llvm/test/CodeGen/NVPTX/load-store.ll | 735 ++++++++++++++--
2 files changed, 1020 insertions(+), 610 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
index cd6b565053300..4d3b11094d3a1 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll
@@ -1,166 +1,90 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s
; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
-; CHECK-LABEL: generic_plain
-define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
- ; CHECK: ld.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load i8, ptr %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i8 %a.add, ptr %a
-
- ; CHECK: ld.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr %b
-
- ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr %c
-
- ; CHECK: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr %d
-
- ; CHECK: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr %c
-
- ; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr %c
-
- ret void
-}
-
-; CHECK-LABEL: generic_volatile
-define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
- ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load volatile i8, ptr %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i8 %a.add, ptr %a
-
- ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr %b
-
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr %c
-
- ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr %d
-
- ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr %c
-
- ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr %c
+;; generic statespace
- ret void
-}
-
-; CHECK-LABEL: generic_monotonic
-define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
- ; CHECK: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr %a monotonic, align 1
+; CHECK-LABEL: generic_acq_rel
+define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr %a acquire, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr %a monotonic, align 1
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr %a release, align 1
- ; CHECK: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b monotonic, align 2
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr %b acquire, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b monotonic, align 2
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr %b release, align 2
- ; CHECK: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c monotonic, align 4
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr %c acquire, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c monotonic, align 4
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr %c release, align 4
- ; CHECK: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d monotonic, align 8
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr %d acquire, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d monotonic, align 8
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr %d release, align 8
- ; CHECK: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e monotonic, align 4
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr %e acquire, align 4
%e.add = fadd float %e.load, 1.0
- ; CHECK: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e monotonic, align 4
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr %e release, align 4
- ; CHECK: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e monotonic, align 8
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr %e acquire, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e monotonic, align 8
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr %e release, align 8
+
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
-; CHECK-LABEL: generic_acq_rel
-define void @generic_acq_rel(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+; CHECK-LABEL: generic_acq_rel_volatile
+define void @generic_acq_rel_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr %a acquire, align 1
+ %a.load = load atomic volatile i8, ptr %a acquire, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr %a release, align 1
+ store atomic volatile i8 %a.add, ptr %a release, align 1
; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr %b acquire, align 2
+ %b.load = load atomic volatile i16, ptr %b acquire, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr %b release, align 2
+ store atomic volatile i16 %b.add, ptr %b release, align 2
; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr %c acquire, align 4
+ %c.load = load atomic volatile i32, ptr %c acquire, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr %c release, align 4
+ store atomic volatile i32 %c.add, ptr %c release, align 4
; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr %d acquire, align 8
+ %d.load = load atomic volatile i64, ptr %d acquire, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr %d release, align 8
+ store atomic volatile i64 %d.add, ptr %d release, align 8
; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr %e acquire, align 4
+ %e.load = load atomic volatile float, ptr %e acquire, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr %e release, align 4
+ store atomic volatile float %e.add, ptr %e release, align 4
; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr %e acquire, align 8
+ %f.load = load atomic volatile double, ptr %e acquire, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr %e release, align 8
+ store atomic volatile double %f.add, ptr %e release, align 8
+
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
@@ -215,216 +139,68 @@ define void @generic_sc(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_ad
; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic double %f.add, ptr %e seq_cst, align 8
- ret void
-}
-
-; CHECK-LABEL: generic_monotonic_volatile
-define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
- ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr %a monotonic, align 1
-
- ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr %b monotonic, align 2
-
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr %c monotonic, align 4
-
- ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr %d monotonic, align 8
-
- ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr %e monotonic, align 4
-
- ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr %e monotonic, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
-;; global statespace
-
-; CHECK-LABEL: global_plain
-define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
- ; CHECK: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load i8, ptr addrspace(1) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i8 %a.add, ptr addrspace(1) %a
-
- ; CHECK: ld.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(1) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(1) %b
-
- ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(1) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(1) %c
-
- ; CHECK: ld.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(1) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(1) %d
-
- ; CHECK: ld.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(1) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(1) %c
-
- ; CHECK: ld.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(1) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(1) %c
-
- ret void
-}
-
-; CHECK-LABEL: global_volatile
-define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d) local_unnamed_addr {
- ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load volatile i8, ptr addrspace(1) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i8 %a.add, ptr addrspace(1) %a
-
- ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(1) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(1) %b
-
- ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(1) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(1) %c
-
- ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(1) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(1) %d
-
- ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(1) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(1) %c
-
- ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(1) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(1) %c
-
- ret void
-}
-
-; CHECK-LABEL: global_monotonic
-define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
+; CHECK-LABEL: generic_sc_volatile
+define void @generic_sc_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr %a seq_cst, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr %a seq_cst, align 1
- ; CHECK: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr %b seq_cst, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr %b seq_cst, align 2
- ; CHECK: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr %c seq_cst, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr %c seq_cst, align 4
- ; CHECK: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr %d seq_cst, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr %d seq_cst, align 8
- ; CHECK: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr %e seq_cst, align 4
%e.add = fadd float %e.load, 1.0
- ; CHECK: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr %e seq_cst, align 4
- ; CHECK: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
-
- ret void
-}
-
-; CHECK-LABEL: global_monotonic_volatile
-define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
- %a.add = add i8 %a.load, 1
- ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
-
- ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
- %b.add = add i16 %b.load, 1
- ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
-
- ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
- %c.add = add i32 %c.load, 1
- ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
-
- ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
- %d.add = add i64 %d.load, 1
- ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
-
- ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
- %e.add = fadd float %e.load, 1.0
- ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr %e seq_cst, align 8
- ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
+;; global statespace
+
; CHECK-LABEL: global_acq_rel
define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
@@ -463,6 +239,8 @@ define void @global_acq_rel(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrsp
; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic double %f.add, ptr addrspace(1) %e release, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
+
ret void
}
@@ -504,175 +282,123 @@ define void @global_acq_rel_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, p
; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8
- ret void
-}
-
-;; shared statespace
-
-; CHECK-LABEL: shared_plain
-define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
- ; CHECK: ld.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load i8, ptr addrspace(3) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i8 %a.add, ptr addrspace(3) %a
-
- ; CHECK: ld.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(3) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(3) %b
-
- ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(3) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(3) %c
-
- ; CHECK: ld.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(3) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(3) %d
-
- ; CHECK: ld.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(3) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(3) %c
-
- ; CHECK: ld.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(3) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(3) %c
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
-; CHECK-LABEL: shared_volatile
-define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d) local_unnamed_addr {
- ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load volatile i8, ptr addrspace(3) %a
- %a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i8 %a.add, ptr addrspace(3) %a
-
- ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(3) %b
- %b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(3) %b
-
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(3) %c
- %c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(3) %c
-
- ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(3) %d
- %d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(3) %d
-
- ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(3) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(3) %c
-
- ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(3) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(3) %c
-
- ret void
-}
-
-; CHECK-LABEL: shared_monotonic
-define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
+; CHECK-LABEL: global_seq_cst
+define void @global_seq_cst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
- ; CHECK: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
- ; CHECK: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
- ; CHECK: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
- ; CHECK: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4
%e.add = fadd float %e.load, 1.0
- ; CHECK: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4
- ; CHECK: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8
+
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
-; CHECK-LABEL: shared_monotonic_volatile
-define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1
+; CHECK-LABEL: global_seq_cst_volatile
+define void @global_seq_cst_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(3) %a monotonic, align 1
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1
- ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(3) %b monotonic, align 2
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(3) %b monotonic, align 2
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(3) %c monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(3) %c monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4
- ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(3) %d monotonic, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(3) %d monotonic, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8
- ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(3) %e monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4
%e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(3) %e monotonic, align 4
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4
- ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(3) %e monotonic, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8
+
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
+;; shared statespace
+
; CHECK-LABEL: shared_acq_rel
define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
@@ -711,6 +437,8 @@ define void @shared_acq_rel(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrsp
; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic double %f.add, ptr addrspace(3) %e release, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
+
ret void
}
@@ -752,253 +480,302 @@ define void @shared_acq_rel_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, p
; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
+
ret void
}
-;; local statespace
-
-; CHECK-LABEL: local_plain
-define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load i8, ptr addrspace(5) %a
+; CHECK-LABEL: shared_seq_cst
+define void @shared_seq_cst(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i8 %a.add, ptr addrspace(5) %a
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load i16, ptr addrspace(5) %b
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store i16 %b.add, ptr addrspace(5) %b
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load i32, ptr addrspace(5) %c
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store i32 %c.add, ptr addrspace(5) %c
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load i64, ptr addrspace(5) %d
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store i64 %d.add, ptr addrspace(5) %d
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load float, ptr addrspace(5) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store float %e.add, ptr addrspace(5) %c
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr addrspace(5) %c
- %f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr addrspace(5) %c
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8
+ %f.add = fadd double %f.load, 1.
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8
+
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
-; CHECK-LABEL: local_volatile
-define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
- ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load volatile i8, ptr addrspace(5) %a
+; CHECK-LABEL: shared_seq_cst_volatile
+define void @shared_seq_cst_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i8 %a.add, ptr addrspace(5) %a
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1
- ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load volatile i16, ptr addrspace(5) %b
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store volatile i16 %b.add, ptr addrspace(5) %b
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2
- ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load volatile i32, ptr addrspace(5) %c
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store volatile i32 %c.add, ptr addrspace(5) %c
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4
- ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load volatile i64, ptr addrspace(5) %d
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store volatile i64 %d.add, ptr addrspace(5) %d
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8
- ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load volatile float, ptr addrspace(5) %c
- %e.add = fadd float %e.load, 1.
- ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store volatile float %e.add, ptr addrspace(5) %c
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4
+ %e.add = fadd float %e.load, 1.0
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4
- ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load volatile double, ptr addrspace(5) %c
+ ; CHECK: fence.sc.sys
+ ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store volatile double %f.add, ptr addrspace(5) %c
+ ; CHECK: fence.sc.sys
+ ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8
+
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
-; CHECK-LABEL: local_monotonic
-define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+;; local statespace
+
+; CHECK-LABEL: local_acq_rel
+define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; TODO: this codegen loses Concurrent Forward Progress
+
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
+ %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ store atomic i8 %a.add, ptr addrspace(5) %a release, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b monotonic, align 2
+ %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+ store atomic i16 %b.add, ptr addrspace(5) %b release, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c monotonic, align 4
+ %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+ store atomic i32 %c.add, ptr addrspace(5) %c release, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d monotonic, align 8
+ %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+ store atomic i64 %d.add, ptr addrspace(5) %d release, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e monotonic, align 4
+ %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e monotonic, align 4
+ store atomic float %e.add, ptr addrspace(5) %e release, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e monotonic, align 8
+ %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
+ store atomic double %f.add, ptr addrspace(5) %e release, align 8
+
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
-; CHECK-LABEL: local_monotonic_volatile
-define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_acq_rel_volatile
+define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; TODO: this codegen loses Concurrent Forward Progress
+
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(5) %a monotonic, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b monotonic, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b monotonic, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c monotonic, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c monotonic, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d monotonic, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d monotonic, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e monotonic, align 4
+ %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e monotonic, align 4
+ store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e monotonic, align 8
+ %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
+ store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8
+
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
-; CHECK-LABEL: local_acq_rel
-define void @local_acq_rel(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_seq_cst
+define void @local_seq_cst(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; TODO: this codegen loses Concurrent Forward Progress
+
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1
+ %a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i8 %a.add, ptr addrspace(5) %a release, align 1
+ store atomic i8 %a.add, ptr addrspace(5) %a seq_cst, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2
+ %b.load = load atomic i16, ptr addrspace(5) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic i16 %b.add, ptr addrspace(5) %b release, align 2
+ store atomic i16 %b.add, ptr addrspace(5) %b seq_cst, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4
+ %c.load = load atomic i32, ptr addrspace(5) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic i32 %c.add, ptr addrspace(5) %c release, align 4
+ store atomic i32 %c.add, ptr addrspace(5) %c seq_cst, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8
+ %d.load = load atomic i64, ptr addrspace(5) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic i64 %d.add, ptr addrspace(5) %d release, align 8
+ store atomic i64 %d.add, ptr addrspace(5) %d seq_cst, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4
+ %e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic float %e.add, ptr addrspace(5) %e release, align 4
+ store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8
+ %f.load = load atomic double, ptr addrspace(5) %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic double %f.add, ptr addrspace(5) %e release, align 8
+ store atomic double %f.add, ptr addrspace(5) %e seq_cst, align 8
+
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
-; CHECK-LABEL: local_acq_rel_volatile
-define void @local_acq_rel_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+; CHECK-LABEL: local_seq_cst_volatile
+define void @local_seq_cst_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; TODO: this codegen loses Concurrent Forward Progress
+
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1
+ %a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1
%a.add = add i8 %a.load, 1
; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1
+ store atomic volatile i8 %a.add, ptr addrspace(5) %a seq_cst, align 1
; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
- %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2
+ %b.load = load atomic volatile i16, ptr addrspace(5) %b seq_cst, align 2
%b.add = add i16 %b.load, 1
; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
- store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2
+ store atomic volatile i16 %b.add, ptr addrspace(5) %b seq_cst, align 2
; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
- %c.load = load atomic volatile i32, ptr addrspace(5) %c acquire, align 4
+ %c.load = load atomic volatile i32, ptr addrspace(5) %c seq_cst, align 4
%c.add = add i32 %c.load, 1
; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
- store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4
+ store atomic volatile i32 %c.add, ptr addrspace(5) %c seq_cst, align 4
; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8
+ %d.load = load atomic volatile i64, ptr addrspace(5) %d seq_cst, align 8
%d.add = add i64 %d.load, 1
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
- store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8
+ store atomic volatile i64 %d.add, ptr addrspace(5) %d seq_cst, align 8
; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
- %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4
+ %e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4
%e.add = fadd float %e.load, 1.0
; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
- store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4
+ store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4
; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8
+ %f.load = load atomic volatile double, ptr addrspace(5) %e seq_cst, align 8
%f.add = fadd double %f.load, 1.
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8
+ store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8
+
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
ret void
}
+
+; TODO: missing .const statespace tests
+; TODO: missing .param statespace tests
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store.ll
index 27065f5eca9f4..5fbaa1e9d498d 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -check-prefixes=CHECK,SM60 %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70
+; RUN: %if ptxas-12.2 %{ llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
; generic statespace
@@ -36,10 +38,81 @@ define void @generic_plain(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr {
store float %e.add, ptr %c
; CHECK: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
- %f.load = load double, ptr %c
+ %f.load = load double, ptr %d
%f.add = fadd double %f.load, 1.
; CHECK: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
- store double %f.add, ptr %c
+ store double %f.add, ptr %d
+
+ ; TODO: should be combined into single .u16 op
+ ; CHECK: ld.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %h.load = load <2 x i8>, ptr %b
+ %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
+ ; CHECK: st.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store <2 x i8> %h.add, ptr %b
+
+ ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %i.load = load <4 x i8>, ptr %c
+ %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
+ ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store <4 x i8> %i.add, ptr %c
+
+ ; CHECK: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %j.load = load <2 x i16>, ptr %c
+ %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
+ ; CHECK: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store <2 x i16> %j.add, ptr %c
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %k.load = load <4 x i16>, ptr %d
+ %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
+ ; CHECK: st.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store <4 x i16> %k.add, ptr %d
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %l.load = load <2 x i32>, ptr %d
+ %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
+ ; CHECK: st.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
+ store <2 x i32> %l.add, ptr %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %m.load = load <4 x i32>, ptr %d
+ %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK: st.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+ store <4 x i32> %m.add, ptr %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %n.load = load <2 x i64>, ptr %d
+ %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
+ ; CHECK: st.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
+ store <2 x i64> %n.add, ptr %d
+
+ ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+ ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %o.load = load <2 x float>, ptr %d
+ %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
+ ; CHECK: st.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
+ store <2 x float> %o.add, ptr %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %p.load = load <4 x float>, ptr %d
+ %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
+ ; CHECK: st.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+ store <4 x float> %p.add, ptr %d
+
+ ; TODO: should be combined into single .b128 op
+ ; CHECK: ld.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %q.load = load <2 x double>, ptr %d
+ %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
+ ; CHECK: st.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
+ store <2 x double> %q.add, ptr %d
ret void
}
@@ -82,47 +155,126 @@ define void @generic_volatile(ptr %a, ptr %b, ptr %c, ptr %d) local_unnamed_addr
; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store volatile double %f.add, ptr %c
+ ; CHECK: ld.volatile.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %h.load = load volatile <2 x i8>, ptr %b
+ %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
+ ; CHECK: st.volatile.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store volatile <2 x i8> %h.add, ptr %b
+
+ ; TODO: should NOT be combined into a single .u32 op
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %i.load = load volatile <4 x i8>, ptr %c
+ %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile <4 x i8> %i.add, ptr %c
+
+ ; TODO: should NOT be combined into a single .u32 op
+ ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %j.load = load volatile <2 x i16>, ptr %c
+ %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
+ ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile <2 x i16> %j.add, ptr %c
+
+ ; CHECK: ld.volatile.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %k.load = load volatile <4 x i16>, ptr %d
+ %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
+ ; CHECK: st.volatile.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store volatile <4 x i16> %k.add, ptr %d
+
+ ; CHECK: ld.volatile.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %l.load = load volatile <2 x i32>, ptr %d
+ %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
+ ; CHECK: st.volatile.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
+ store volatile <2 x i32> %l.add, ptr %d
+
+ ; CHECK: ld.volatile.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %m.load = load volatile <4 x i32>, ptr %d
+ %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK: st.volatile.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+ store volatile <4 x i32> %m.add, ptr %d
+
+ ; CHECK: ld.volatile.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %n.load = load volatile <2 x i64>, ptr %d
+ %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
+ ; CHECK: st.volatile.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
+ store volatile <2 x i64> %n.add, ptr %d
+
+ ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+ ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+ ; CHECK: ld.volatile.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %o.load = load volatile <2 x float>, ptr %d
+ %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
+ ; CHECK: st.volatile.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
+ store volatile <2 x float> %o.add, ptr %d
+
+ ; CHECK: ld.volatile.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %p.load = load volatile <4 x float>, ptr %d
+ %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
+ ; CHECK: st.volatile.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+ store volatile <4 x float> %p.add, ptr %d
+
+ ; CHECK: ld.volatile.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %q.load = load volatile <2 x double>, ptr %d
+ %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
+ ; CHECK: st.volatile.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
+ store volatile <2 x double> %q.add, ptr %d
+
ret void
}
; CHECK-LABEL: generic_monotonic
define void @generic_monotonic(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr {
- ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM60: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM70: st.relaxed.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i8 %a.add, ptr %a monotonic, align 1
- ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%b.load = load atomic i16, ptr %b monotonic, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM60: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM70: st.relaxed.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i16 %b.add, ptr %b monotonic, align 2
- ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
%c.load = load atomic i32, ptr %c monotonic, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ ; SM60: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ ; SM70: st.relaxed.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
store atomic i32 %c.add, ptr %c monotonic, align 4
- ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
%d.load = load atomic i64, ptr %d monotonic, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ ; SM60: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ ; SM70: st.relaxed.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
store atomic i64 %d.add, ptr %d monotonic, align 8
- ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr %e monotonic, align 4
%e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ ; SM60: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ ; SM70: st.relaxed.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr %e monotonic, align 4
- ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
%f.load = load atomic double, ptr %e monotonic, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ ; SM60: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ ; SM70: st.relaxed.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic double %f.add, ptr %e monotonic, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
+
ret void
}
@@ -164,6 +316,8 @@ define void @generic_monotonic_volatile(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e)
; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic volatile double %f.add, ptr %e monotonic, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
+
ret void
}
@@ -207,6 +361,77 @@ define void @global_plain(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspac
; CHECK: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store double %f.add, ptr addrspace(1) %c
+ ; TODO: should be combined into single .u16 op
+ ; CHECK: ld.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %h.load = load <2 x i8>, ptr addrspace(1) %b
+ %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
+ ; CHECK: st.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store <2 x i8> %h.add, ptr addrspace(1) %b
+
+ ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %i.load = load <4 x i8>, ptr addrspace(1) %c
+ %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
+ ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store <4 x i8> %i.add, ptr addrspace(1) %c
+
+ ; CHECK: ld.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %j.load = load <2 x i16>, ptr addrspace(1) %c
+ %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
+ ; CHECK: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store <2 x i16> %j.add, ptr addrspace(1) %c
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %k.load = load <4 x i16>, ptr addrspace(1) %d
+ %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
+ ; CHECK: st.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store <4 x i16> %k.add, ptr addrspace(1) %d
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %l.load = load <2 x i32>, ptr addrspace(1) %d
+ %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
+ ; CHECK: st.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
+ store <2 x i32> %l.add, ptr addrspace(1) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %m.load = load <4 x i32>, ptr addrspace(1) %d
+ %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK: st.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+ store <4 x i32> %m.add, ptr addrspace(1) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %n.load = load <2 x i64>, ptr addrspace(1) %d
+ %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
+ ; CHECK: st.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
+ store <2 x i64> %n.add, ptr addrspace(1) %d
+
+ ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+ ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %o.load = load <2 x float>, ptr addrspace(1) %d
+ %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
+ ; CHECK: st.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
+ store <2 x float> %o.add, ptr addrspace(1) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %p.load = load <4 x float>, ptr addrspace(1) %d
+ %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
+ ; CHECK: st.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+ store <4 x float> %p.add, ptr addrspace(1) %d
+
+ ; TODO: should be combined into single .b128 op
+ ; CHECK: ld.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %q.load = load <2 x double>, ptr addrspace(1) %d
+ %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
+ ; CHECK: st.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
+ store <2 x double> %q.add, ptr addrspace(1) %d
+
ret void
}
@@ -248,88 +473,181 @@ define void @global_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrs
; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store volatile double %f.add, ptr addrspace(1) %c
+ ; CHECK: ld.volatile.global.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %h.load = load volatile <2 x i8>, ptr addrspace(1) %b
+ %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
+ ; CHECK: st.volatile.global.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store volatile<2 x i8> %h.add, ptr addrspace(1) %b
+
+ ; TODO: should NOT be combined into single .u32 op
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %i.load = load volatile <4 x i8>, ptr addrspace(1) %c
+ %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile<4 x i8> %i.add, ptr addrspace(1) %c
+
+ ; TODO: should NOT be combined into single .u32 op
+ ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %j.load = load volatile <2 x i16>, ptr addrspace(1) %c
+ %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
+ ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile<2 x i16> %j.add, ptr addrspace(1) %c
+
+ ; CHECK: ld.volatile.global.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %k.load = load volatile <4 x i16>, ptr addrspace(1) %d
+ %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
+ ; CHECK: st.volatile.global.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store volatile<4 x i16> %k.add, ptr addrspace(1) %d
+
+ ; CHECK: ld.volatile.global.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %l.load = load volatile <2 x i32>, ptr addrspace(1) %d
+ %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
+ ; CHECK: st.volatile.global.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
+ store volatile<2 x i32> %l.add, ptr addrspace(1) %d
+
+ ; CHECK: ld.volatile.global.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %m.load = load volatile <4 x i32>, ptr addrspace(1) %d
+ %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK: st.volatile.global.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+ store volatile<4 x i32> %m.add, ptr addrspace(1) %d
+
+ ; CHECK: ld.volatile.global.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %n.load = load volatile <2 x i64>, ptr addrspace(1) %d
+ %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
+ ; CHECK: st.volatile.global.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
+ store volatile<2 x i64> %n.add, ptr addrspace(1) %d
+
+ ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+ ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+ ; CHECK: ld.volatile.global.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %o.load = load volatile <2 x float>, ptr addrspace(1) %d
+ %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
+ ; CHECK: st.volatile.global.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
+ store volatile<2 x float> %o.add, ptr addrspace(1) %d
+
+ ; CHECK: ld.volatile.global.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %p.load = load volatile <4 x float>, ptr addrspace(1) %d
+ %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
+ ; CHECK: st.volatile.global.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+ store volatile<4 x float> %p.add, ptr addrspace(1) %d
+
+ ; CHECK: ld.volatile.global.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %q.load = load volatile <2 x double>, ptr addrspace(1) %d
+ %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
+ ; CHECK: st.volatile.global.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
+ store volatile<2 x double> %q.add, ptr addrspace(1) %d
+
ret void
}
; CHECK-LABEL: global_monotonic
define void @global_monotonic(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM70: st.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i8 %a.add, ptr addrspace(1) %a monotonic, align 1
- ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%b.load = load atomic i16, ptr addrspace(1) %b monotonic, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM70: st.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i16 %b.add, ptr addrspace(1) %b monotonic, align 2
- ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
%c.load = load atomic i32, ptr addrspace(1) %c monotonic, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ ; SM70: st.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
store atomic i32 %c.add, ptr addrspace(1) %c monotonic, align 4
- ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
%d.load = load atomic i64, ptr addrspace(1) %d monotonic, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ ; SM70: st.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
store atomic i64 %d.add, ptr addrspace(1) %d monotonic, align 8
- ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(1) %e monotonic, align 4
%e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ ; SM70: st.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr addrspace(1) %e monotonic, align 4
- ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
%f.load = load atomic double, ptr addrspace(1) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ ; SM70: st.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic double %f.add, ptr addrspace(1) %e monotonic, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
+
ret void
}
; CHECK-LABEL: global_monotonic_volatile
define void @global_monotonic_volatile(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr {
- ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM60: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM70: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic volatile i8 %a.add, ptr addrspace(1) %a monotonic, align 1
- ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%b.load = load atomic volatile i16, ptr addrspace(1) %b monotonic, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM60: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM70: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic volatile i16 %b.add, ptr addrspace(1) %b monotonic, align 2
- ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
%c.load = load atomic volatile i32, ptr addrspace(1) %c monotonic, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ ; SM60: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ ; SM70: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
store atomic volatile i32 %c.add, ptr addrspace(1) %c monotonic, align 4
- ; CHECK: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
%d.load = load atomic volatile i64, ptr addrspace(1) %d monotonic, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ ; SM60: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ ; SM70: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
store atomic volatile i64 %d.add, ptr addrspace(1) %d monotonic, align 8
- ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic volatile float, ptr addrspace(1) %e monotonic, align 4
%e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ ; SM60: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ ; SM70: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic volatile float %e.add, ptr addrspace(1) %e monotonic, align 4
- ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
%f.load = load atomic volatile double, ptr addrspace(1) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ ; SM60: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ ; SM70: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic volatile double %f.add, ptr addrspace(1) %e monotonic, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
+
ret void
}
@@ -373,6 +691,77 @@ define void @shared_plain(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspac
; CHECK: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store double %f.add, ptr addrspace(3) %c
+ ; TODO: should be combined into single .u16 op
+ ; CHECK: ld.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %h.load = load <2 x i8>, ptr addrspace(3) %b
+ %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
+ ; CHECK: st.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store <2 x i8> %h.add, ptr addrspace(3) %b
+
+ ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %i.load = load <4 x i8>, ptr addrspace(3) %c
+ %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
+ ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store <4 x i8> %i.add, ptr addrspace(3) %c
+
+ ; CHECK: ld.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %j.load = load <2 x i16>, ptr addrspace(3) %c
+ %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
+ ; CHECK: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store <2 x i16> %j.add, ptr addrspace(3) %c
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %k.load = load <4 x i16>, ptr addrspace(3) %d
+ %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
+ ; CHECK: st.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store <4 x i16> %k.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %l.load = load <2 x i32>, ptr addrspace(3) %d
+ %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
+ ; CHECK: st.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
+ store <2 x i32> %l.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %m.load = load <4 x i32>, ptr addrspace(3) %d
+ %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK: st.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+ store <4 x i32> %m.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %n.load = load <2 x i64>, ptr addrspace(3) %d
+ %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
+ ; CHECK: st.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
+ store <2 x i64> %n.add, ptr addrspace(3) %d
+
+ ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+ ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %o.load = load <2 x float>, ptr addrspace(3) %d
+ %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
+ ; CHECK: st.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
+ store <2 x float> %o.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %p.load = load <4 x float>, ptr addrspace(3) %d
+ %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
+ ; CHECK: st.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+ store <4 x float> %p.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %q.load = load <2 x double>, ptr addrspace(3) %d
+ %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
+ ; CHECK: st.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
+ store <2 x double> %q.add, ptr addrspace(3) %d
+
ret void
}
@@ -414,47 +803,134 @@ define void @shared_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrs
; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store volatile double %f.add, ptr addrspace(3) %c
+ ; TODO: should be combined into single .u16 op
+ ; CHECK: ld.volatile.shared.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %h.load = load volatile <2 x i8>, ptr addrspace(3) %b
+ %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
+ ; CHECK: st.volatile.shared.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store volatile <2 x i8> %h.add, ptr addrspace(3) %b
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %i.load = load volatile <4 x i8>, ptr addrspace(3) %c
+ %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile <4 x i8> %i.add, ptr addrspace(3) %c
+
+ ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %j.load = load volatile <2 x i16>, ptr addrspace(3) %c
+ %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
+ ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile <2 x i16> %j.add, ptr addrspace(3) %c
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.volatile.shared.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %k.load = load volatile <4 x i16>, ptr addrspace(3) %d
+ %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
+ ; CHECK: st.volatile.shared.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store volatile <4 x i16> %k.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.volatile.shared.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %l.load = load volatile <2 x i32>, ptr addrspace(3) %d
+ %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
+ ; CHECK: st.volatile.shared.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
+ store volatile <2 x i32> %l.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.volatile.shared.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %m.load = load volatile <4 x i32>, ptr addrspace(3) %d
+ %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK: st.volatile.shared.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+ store volatile <4 x i32> %m.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.volatile.shared.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %n.load = load volatile <2 x i64>, ptr addrspace(3) %d
+ %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
+ ; CHECK: st.volatile.shared.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
+ store volatile <2 x i64> %n.add, ptr addrspace(3) %d
+
+ ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+ ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.volatile.shared.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %o.load = load volatile <2 x float>, ptr addrspace(3) %d
+ %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
+ ; CHECK: st.volatile.shared.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
+ store volatile <2 x float> %o.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.volatile.shared.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %p.load = load volatile <4 x float>, ptr addrspace(3) %d
+ %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
+ ; CHECK: st.volatile.shared.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+ store volatile <4 x float> %p.add, ptr addrspace(3) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.volatile.shared.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %q.load = load volatile <2 x double>, ptr addrspace(3) %d
+ %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
+ ; CHECK: st.volatile.shared.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
+ store volatile <2 x double> %q.add, ptr addrspace(3) %d
+
ret void
}
; CHECK-LABEL: shared_monotonic
define void @shared_monotonic(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr {
- ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; TODO: in some cases it may be valid to optimize .sys.shared to .cta.shared or .cluster.shared.
+
+ ; SM60: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1
%a.add = add i8 %a.load, 1
- ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM60: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM70: st.relaxed.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i8 %a.add, ptr addrspace(3) %a monotonic, align 1
- ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%b.load = load atomic i16, ptr addrspace(3) %b monotonic, align 2
%b.add = add i16 %b.load, 1
- ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM60: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
+ ; SM70: st.relaxed.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}}
store atomic i16 %b.add, ptr addrspace(3) %b monotonic, align 2
- ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
%c.load = load atomic i32, ptr addrspace(3) %c monotonic, align 4
%c.add = add i32 %c.load, 1
- ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ ; SM60: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ ; SM70: st.relaxed.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
store atomic i32 %c.add, ptr addrspace(3) %c monotonic, align 4
- ; CHECK: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}]
%d.load = load atomic i64, ptr addrspace(3) %d monotonic, align 8
%d.add = add i64 %d.load, 1
- ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ ; SM60: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
+ ; SM70: st.relaxed.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}}
store atomic i64 %d.add, ptr addrspace(3) %d monotonic, align 8
- ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}]
%e.load = load atomic float, ptr addrspace(3) %e monotonic, align 4
%e.add = fadd float %e.load, 1.0
- ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ ; SM60: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
+ ; SM70: st.relaxed.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}}
store atomic float %e.add, ptr addrspace(3) %e monotonic, align 4
- ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM60: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
+ ; SM70: ld.relaxed.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}]
%f.load = load atomic double, ptr addrspace(3) %e monotonic, align 8
%f.add = fadd double %f.load, 1.
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ ; SM60: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
+ ; SM70: st.relaxed.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic double %f.add, ptr addrspace(3) %e monotonic, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
+
ret void
}
@@ -496,6 +972,8 @@ define void @shared_monotonic_volatile(ptr addrspace(3) %a, ptr addrspace(3) %b,
; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic volatile double %f.add, ptr addrspace(3) %e monotonic, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
+
ret void
}
@@ -539,11 +1017,84 @@ define void @local_plain(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store double %f.add, ptr addrspace(5) %c
+ ; TODO: should be combined into single .u16 op
+ ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %h.load = load <2 x i8>, ptr addrspace(5) %b
+ %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
+ ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store <2 x i8> %h.add, ptr addrspace(5) %b
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %i.load = load <4 x i8>, ptr addrspace(5) %c
+ %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store <4 x i8> %i.add, ptr addrspace(5) %c
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %j.load = load <2 x i16>, ptr addrspace(5) %c
+ %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store <2 x i16> %j.add, ptr addrspace(5) %c
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %k.load = load <4 x i16>, ptr addrspace(5) %d
+ %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
+ ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store <4 x i16> %k.add, ptr addrspace(5) %d
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %l.load = load <2 x i32>, ptr addrspace(5) %d
+ %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
+ ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
+ store <2 x i32> %l.add, ptr addrspace(5) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %m.load = load <4 x i32>, ptr addrspace(5) %d
+ %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+ store <4 x i32> %m.add, ptr addrspace(5) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %n.load = load <2 x i64>, ptr addrspace(5) %d
+ %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
+ ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
+ store <2 x i64> %n.add, ptr addrspace(5) %d
+
+ ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+ ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %o.load = load <2 x float>, ptr addrspace(5) %d
+ %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
+ ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
+ store <2 x float> %o.add, ptr addrspace(5) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %p.load = load <4 x float>, ptr addrspace(5) %d
+ %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
+ ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+ store <4 x float> %p.add, ptr addrspace(5) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %q.load = load <2 x double>, ptr addrspace(5) %d
+ %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
+ ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
+ store <2 x double> %q.add, ptr addrspace(5) %d
+
ret void
}
; CHECK-LABEL: local_volatile
define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d) local_unnamed_addr {
+ ; TODO: this codegen loses Concurrent Forward Progress
+
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load volatile i8, ptr addrspace(5) %a
%a.add = add i8 %a.load, 1
@@ -580,11 +1131,84 @@ define void @local_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrsp
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store volatile double %f.add, ptr addrspace(5) %c
+ ; TODO: should be combined into single .u16 op
+ ; CHECK: ld.local.v2.u8 {%rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %h.load = load volatile <2 x i8>, ptr addrspace(5) %b
+ %h.add = add <2 x i8> %h.load, <i8 1, i8 1>
+ ; CHECK: st.local.v2.u8 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store volatile <2 x i8> %h.add, ptr addrspace(5) %b
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %i.load = load volatile <4 x i8>, ptr addrspace(5) %c
+ %i.add = add <4 x i8> %i.load, <i8 1, i8 1, i8 1, i8 1>
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile <4 x i8> %i.add, ptr addrspace(5) %c
+
+ ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}]
+ %j.load = load volatile <2 x i16>, ptr addrspace(5) %c
+ %j.add = add <2 x i16> %j.load, <i16 1, i16 1>
+ ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}}
+ store volatile <2 x i16> %j.add, ptr addrspace(5) %c
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.local.v4.u16 {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %k.load = load volatile <4 x i16>, ptr addrspace(5) %d
+ %k.add = add <4 x i16> %k.load, <i16 1, i16 1, i16 1, i16 1>
+ ; CHECK: st.local.v4.u16 [%rd{{[0-9]+}}], {%rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}}
+ store volatile <4 x i16> %k.add, ptr addrspace(5) %d
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.local.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %l.load = load volatile <2 x i32>, ptr addrspace(5) %d
+ %l.add = add <2 x i32> %l.load, <i32 1, i32 1>
+ ; CHECK: st.local.v2.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}}
+ store volatile <2 x i32> %l.add, ptr addrspace(5) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.local.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %m.load = load volatile <4 x i32>, ptr addrspace(5) %d
+ %m.add = add <4 x i32> %m.load, <i32 1, i32 1, i32 1, i32 1>
+ ; CHECK: st.local.v4.u32 [%rd{{[0-9]+}}], {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}}
+ store volatile <4 x i32> %m.add, ptr addrspace(5) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.local.v2.u64 {%rd{{[0-9]+}}, %rd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %n.load = load volatile <2 x i64>, ptr addrspace(5) %d
+ %n.add = add <2 x i64> %n.load, <i64 1, i64 1>
+ ; CHECK: st.local.v2.u64 [%rd{{[0-9]+}}], {%rd{{[0-9]+}}, %rd{{[0-9]+}}}
+ store volatile <2 x i64> %n.add, ptr addrspace(5) %d
+
+ ; Note: per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+ ; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
+
+ ; TODO: should be combined into single .u64 op
+ ; CHECK: ld.local.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %o.load = load volatile <2 x float>, ptr addrspace(5) %d
+ %o.add = fadd <2 x float> %o.load, <float 1., float 1.>
+ ; CHECK: st.local.v2.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}}
+ store volatile <2 x float> %o.add, ptr addrspace(5) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.local.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %p.load = load volatile <4 x float>, ptr addrspace(5) %d
+ %p.add = fadd <4 x float> %p.load, <float 1., float 1., float 1., float 1.>
+ ; CHECK: st.local.v4.f32 [%rd{{[0-9]+}}], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}
+ store volatile <4 x float> %p.add, ptr addrspace(5) %d
+
+ ; TODO: should be combined into single .b128 op in sm_70+
+ ; CHECK: ld.local.v2.f64 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]
+ %q.load = load volatile <2 x double>, ptr addrspace(5) %d
+ %q.add = fadd <2 x double> %q.load, <double 1., double 1.>
+ ; CHECK: st.local.v2.f64 [%rd{{[0-9]+}}], {%fd{{[0-9]+}}, %fd{{[0-9]+}}}
+ store volatile <2 x double> %q.add, ptr addrspace(5) %d
+
ret void
}
; CHECK-LABEL: local_monotonic
define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; TODO: this codegen loses Concurrent Forward Progress
+
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1
%a.add = add i8 %a.load, 1
@@ -621,11 +1245,15 @@ define void @local_monotonic(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrs
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic double %f.add, ptr addrspace(5) %e monotonic, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
+
ret void
}
; CHECK-LABEL: local_monotonic_volatile
define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr {
+ ; TODO: this codegen loses Concurrent Forward Progress
+
; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}]
%a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1
%a.add = add i8 %a.load, 1
@@ -662,5 +1290,10 @@ define void @local_monotonic_volatile(ptr addrspace(5) %a, ptr addrspace(5) %b,
; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}}
store atomic volatile double %f.add, ptr addrspace(5) %e monotonic, align 8
+ ; TODO: LLVM IR Verifier does not support atomics on vector types.
+
ret void
}
+
+; TODO: missing .const statespace tests
+; TODO: missing .param statespace tests
>From 696c71ec8a79de24a0a9a6b4290aa80fc7269028 Mon Sep 17 00:00:00 2001
From: Gonzalo Brito Gadeschi <gonzalob at nvidia.com>
Date: Tue, 16 Jul 2024 07:03:27 -0700
Subject: [PATCH 8/8] [NVPTX] Cleanups - Remove dead code in InstPrinter -
Capitalization and improve Table comments - Update fence->membar lowering
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 12 ------------
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 18 ++++++++++--------
2 files changed, 10 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 45561d5a11238..3a692feb47ed6 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -256,18 +256,6 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
report_fatal_error(OS.str());
break;
}
- } else if (!strcmp(Modifier, "sc")) {
- auto Ordering =
- NVPTX::Ordering(static_cast<NVPTX::OrderingUnderlyingType>(Imm));
- switch (Ordering) {
- // TODO: refactor fence insertion in ISelDagToDag instead of here
- // as part of implementing atomicrmw seq_cst.
- case NVPTX::Ordering::SequentiallyConsistent:
- O << "fence.sc.sys;\n\t";
- break;
- default:
- break;
- }
} else if (!strcmp(Modifier, "addsp")) {
switch (Imm) {
case NVPTX::PTXLdStInstCode::GLOBAL:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index e18b0e0fb3d9f..17cd560931d7a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -717,10 +717,10 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
struct OperationOrderings {
NVPTX::OrderingUnderlyingType InstrOrdering;
NVPTX::OrderingUnderlyingType FenceOrdering;
- OperationOrderings(NVPTX::Ordering o = NVPTX::Ordering::NotAtomic,
- NVPTX::Ordering f = NVPTX::Ordering::NotAtomic)
- : InstrOrdering(static_cast<NVPTX::OrderingUnderlyingType>(o)),
- FenceOrdering(static_cast<NVPTX::OrderingUnderlyingType>(f)) {}
+ OperationOrderings(NVPTX::Ordering O = NVPTX::Ordering::NotAtomic,
+ NVPTX::Ordering F = NVPTX::Ordering::NotAtomic)
+ : InstrOrdering(static_cast<NVPTX::OrderingUnderlyingType>(O)),
+ FenceOrdering(static_cast<NVPTX::OrderingUnderlyingType>(F)) {}
};
static OperationOrderings
@@ -734,6 +734,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
// clang-format off
// Lowering for Load/Store Operations (note: AcquireRelease Loads or Stores error).
+ // Note: uses of Relaxed in the Atomic column of this table refer
+ // to LLVM AtomicOrdering::Monotonic.
//
// | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
// |---------|----------|--------------------|------------|------------------------------|
@@ -1153,7 +1155,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
case NVPTX::Ordering::SequentiallyConsistent: {
unsigned Op = Subtarget->hasMemoryOrdering()
? NVPTX::atomic_thread_fence_seq_cst_sys
- : NVPTX::atomic_thread_fence_seq_cst_sys_membar;
+ : NVPTX::INT_MEMBAR_SYS;
Chain = SDValue(CurDAG->getMachineNode(Op, dl, MVT::Other, Chain), 0);
break;
}
@@ -1316,7 +1318,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
case NVPTX::Ordering::SequentiallyConsistent: {
unsigned Op = Subtarget->hasMemoryOrdering()
? NVPTX::atomic_thread_fence_seq_cst_sys
- : NVPTX::atomic_thread_fence_seq_cst_sys_membar;
+ : NVPTX::INT_MEMBAR_SYS;
Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
break;
}
@@ -1988,7 +1990,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
case NVPTX::Ordering::SequentiallyConsistent: {
unsigned Op = Subtarget->hasMemoryOrdering()
? NVPTX::atomic_thread_fence_seq_cst_sys
- : NVPTX::atomic_thread_fence_seq_cst_sys_membar;
+ : NVPTX::INT_MEMBAR_SYS;
Chain = SDValue(CurDAG->getMachineNode(Op, dl, MVT::Other, Chain), 0);
break;
}
@@ -2148,7 +2150,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
case NVPTX::Ordering::SequentiallyConsistent: {
unsigned Op = Subtarget->hasMemoryOrdering()
? NVPTX::atomic_thread_fence_seq_cst_sys
- : NVPTX::atomic_thread_fence_seq_cst_sys_membar;
+ : NVPTX::INT_MEMBAR_SYS;
Chain = SDValue(CurDAG->getMachineNode(Op, DL, MVT::Other, Chain), 0);
break;
}
More information about the llvm-commits
mailing list