[llvm] Revert "[NVPTX] Add Volta Load/Store Atomics (.relaxed, .acquire, .release) and Volatile (.mmio/.volatile) support" (PR #99695)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 19 12:56:44 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-nvptx
Author: Jorge Gorbe Moya (slackito)
Changes:
Reverts llvm/llvm-project#98022
---
Patch is 108.79 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/99695.diff
7 Files Affected:
- (modified) llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp (+2-26)
- (modified) llvm/lib/Target/NVPTX/NVPTX.h (-8)
- (modified) llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (+76-238)
- (modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+72-72)
- (modified) llvm/lib/Target/NVPTX/NVPTXSubtarget.h (+1-6)
- (removed) llvm/test/CodeGen/NVPTX/load-store-sm-70.ll (-951)
- (modified) llvm/test/CodeGen/NVPTX/load-store.ll (+6-547)
``````````diff
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index a004d64c21cc6..380d878c1f532 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -227,33 +227,9 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
if (Modifier) {
const MCOperand &MO = MI->getOperand(OpNum);
int Imm = (int) MO.getImm();
- if (!strcmp(Modifier, "sem")) {
- switch (Imm) {
- case NVPTX::PTXLdStInstCode::NotAtomic:
- break;
- case NVPTX::PTXLdStInstCode::Volatile:
+ if (!strcmp(Modifier, "volatile")) {
+ if (Imm)
O << ".volatile";
- break;
- case NVPTX::PTXLdStInstCode::Relaxed:
- O << ".relaxed.sys";
- break;
- case NVPTX::PTXLdStInstCode::Acquire:
- O << ".acquire.sys";
- break;
- case NVPTX::PTXLdStInstCode::Release:
- O << ".release.sys";
- break;
- case NVPTX::PTXLdStInstCode::RelaxedMMIO:
- O << ".mmio.relaxed.sys";
- break;
- default:
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "NVPTX LdStCode Printer does not support \"" << Imm
- << "\" sem modifier.";
- report_fatal_error(OS.str());
- break;
- }
} else if (!strcmp(Modifier, "addsp")) {
switch (Imm) {
case NVPTX::PTXLdStInstCode::GLOBAL:
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 3c7167b157025..b0cb24c63c3ce 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -107,14 +107,6 @@ enum LoadStore {
};
namespace PTXLdStInstCode {
-enum MemorySemantic {
- NotAtomic = 0, // PTX calls these: "Weak"
- Volatile = 1,
- Relaxed = 2,
- Acquire = 3,
- Release = 4,
- RelaxedMMIO = 5
-};
enum AddressSpace {
GENERIC = 0,
GLOBAL = 1,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 371ec8596ef63..11193c11ede3b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -714,170 +714,6 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
return NVPTX::PTXLdStInstCode::GENERIC;
}
-static unsigned int getCodeMemorySemantic(MemSDNode *N,
- const NVPTXSubtarget *Subtarget) {
- AtomicOrdering Ordering = N->getSuccessOrdering();
- auto CodeAddrSpace = getCodeAddrSpace(N);
-
- bool HasMemoryOrdering = Subtarget->hasMemoryOrdering();
- bool HasRelaxedMMIO = Subtarget->hasRelaxedMMIO();
-
- // TODO: lowering for SequentiallyConsistent Operations: for now, we error.
- // TODO: lowering for AcquireRelease Operations: for now, we error.
- //
-
- // clang-format off
-
- // Lowering for non-SequentiallyConsistent Operations
- //
- // | Atomic | Volatile | Statespace | PTX sm_60- | PTX sm_70+ |
- // |---------|----------|--------------------|------------|------------------------------|
- // | No | No | All | plain | .weak |
- // | No | Yes | Generic,Shared, | .volatile | .volatile |
- // | | | Global [0] | | |
- // | No | Yes | Local,Const,Param | plain [1] | .weak [1] |
- // | Relaxed | No | Generic,Shared, | | |
- // | | | Global [0] | .volatile | <atomic sem> |
- // | Other | No | Generic,Shared, | Error [2] | <atomic sem> |
- // | | | Global [0] | | |
- // | Yes | No | Local,Const,Param | plain [1] | .weak [1] |
- // | Relaxed | Yes | Generic,Shared [0] | .volatile | .volatile |
- // | Relaxed | Yes | Global [0] | .volatile | .mmio.relaxed.sys (PTX 8.2+) |
- // | | | | | or .volatile (PTX 8.1-) |
- // | Relaxed | Yes | Local,Const,Param | plain [1] | .weak [1] |
- // | Other | Yes | Generic, Shared, | Error [2] | <atomic sem> [3] |
- // | | | / Global [0] | | |
-
- // clang-format on
-
- // [0]: volatile and atomics are only supported on global or shared
- // memory locations, accessed via generic/shared/global pointers.
- // MMIO is only supported on global memory locations,
- // accessed via generic/global pointers.
- // TODO: Implement MMIO access via generic pointer to global.
- // Currently implemented for global pointers only.
-
- // [1]: Lowering volatile/atomic operations to non-volatile/non-atomic
- // PTX instructions fails to preserve their C++ side-effects.
- //
- // Example (https://github.com/llvm/llvm-project/issues/62057):
- //
- // void example() {
- // std::atomic<bool> True = true;
- // while (True.load(std::memory_order_relaxed));
- // }
- //
- // A C++ program that calls "example" is well-defined: the infinite loop
- // performs an atomic operation. By lowering volatile/atomics to
- // "weak" memory operations, we are transforming the above into:
- //
- // void undefined_behavior() {
- // bool True = true;
- // while (True);
- // }
- //
- // which exhibits undefined behavior in both C++ and PTX.
- //
- // Calling "example" in CUDA C++ compiled for sm_60- exhibits undefined
- // behavior due to lack of Independent Forward Progress. Lowering these
- // to weak memory operations in sm_60- is therefore fine.
- //
- // TODO: lower atomic and volatile operations to memory locations
- // in local, const, and param to two PTX instructions in sm_70+:
- // - the "weak" memory instruction we are currently lowering to, and
- // - some other instruction that preserves the side-effect, e.g.,
- // a dead dummy volatile load.
-
- if (CodeAddrSpace == NVPTX::PTXLdStInstCode::LOCAL ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::PARAM) {
- return NVPTX::PTXLdStInstCode::NotAtomic;
- }
-
- // [2]: Atomics with Ordering different than Relaxed are not supported on
- // sm_60 and older; this includes volatile atomics.
- if (!(Ordering == AtomicOrdering::NotAtomic ||
- Ordering == AtomicOrdering::Monotonic) &&
- !HasMemoryOrdering) {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX does not support \"atomic\" for orderings different than"
- "\"NotAtomic\" or \"Monotonic\" for sm_60 or older, but order is: \""
- << toIRString(Ordering) << "\".";
- report_fatal_error(OS.str());
- }
-
- // [3]: TODO: these should eventually use .mmio<.atomic sem>; for now we drop
- // the volatile semantics and preserve the atomic ones.
-
- // PTX volatile and PTX atomics are not available for statespace that differ
- // from .generic, .global, or .shared. The behavior of PTX volatile and PTX
- // atomics is undefined if the generic address does not refer to a .global or
- // .shared memory location.
- bool AddrGenericOrGlobalOrShared =
- (CodeAddrSpace == NVPTX::PTXLdStInstCode::GENERIC ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL ||
- CodeAddrSpace == NVPTX::PTXLdStInstCode::SHARED);
- bool UseRelaxedMMIO =
- HasRelaxedMMIO && CodeAddrSpace == NVPTX::PTXLdStInstCode::GLOBAL;
-
- switch (Ordering) {
- case AtomicOrdering::NotAtomic:
- return N->isVolatile() && AddrGenericOrGlobalOrShared
- ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
- case AtomicOrdering::Monotonic:
- if (N->isVolatile())
- return UseRelaxedMMIO ? NVPTX::PTXLdStInstCode::RelaxedMMIO
- : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
- else
- return HasMemoryOrdering ? NVPTX::PTXLdStInstCode::Relaxed
- : AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Volatile
- : NVPTX::PTXLdStInstCode::NotAtomic;
- case AtomicOrdering::Acquire:
- if (!N->readMem()) {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX only supports Acquire Ordering on reads: "
- << N->getOperationName();
- N->print(OS);
- report_fatal_error(OS.str());
- }
- return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Acquire
- : NVPTX::PTXLdStInstCode::NotAtomic;
- case AtomicOrdering::Release:
- if (!N->writeMem()) {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX only supports Release Ordering on writes: "
- << N->getOperationName();
- N->print(OS);
- report_fatal_error(OS.str());
- }
- return AddrGenericOrGlobalOrShared ? NVPTX::PTXLdStInstCode::Release
- : NVPTX::PTXLdStInstCode::NotAtomic;
- case AtomicOrdering::AcquireRelease: {
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "PTX only supports AcquireRelease Ordering on read-modify-write: "
- << N->getOperationName();
- N->print(OS);
- report_fatal_error(OS.str());
- }
- case AtomicOrdering::SequentiallyConsistent:
- case AtomicOrdering::Unordered:
- // TODO: support AcquireRelease and SequentiallyConsistent
- SmallString<256> Msg;
- raw_svector_ostream OS(Msg);
- OS << "NVPTX backend does not support AtomicOrdering \""
- << toIRString(Ordering) << "\" yet.";
- report_fatal_error(OS.str());
- }
-
- llvm_unreachable("unexpected unhandled case");
-}
-
static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
unsigned CodeAddrSpace, MachineFunction *F) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
@@ -1080,18 +916,32 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
if (!LoadedVT.isSimple())
return false;
+ AtomicOrdering Ordering = LD->getSuccessOrdering();
+ // In order to lower atomic loads with stronger guarantees we would need to
+ // use load.acquire or insert fences. However these features were only added
+ // with PTX ISA 6.0 / sm_70.
+ // TODO: Check if we can actually use the new instructions and implement them.
+ if (isStrongerThanMonotonic(Ordering))
+ return false;
+
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
return tryLDGLDU(N);
}
- // Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(LD, Subtarget);
-
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());
+ // Volatile Setting
+ // - .volatile is only available for .global and .shared
+ // - .volatile has the same memory synchronization semantics as .relaxed.sys
+ bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ isVolatile = false;
+
// Type Setting: fromType + fromTypeWidth
//
// Sign : ISD::SEXTLOAD
@@ -1132,13 +982,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- Addr,
- Chain};
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl), Addr, Chain };
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
: SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
@@ -1147,14 +993,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- Base,
- Offset,
- Chain};
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
: SelectADDRri(N1.getNode(), N1, Base, Offset)) {
@@ -1169,14 +1010,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- Base,
- Offset,
- Chain};
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
} else {
if (PointerSize == 64)
@@ -1190,13 +1026,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, dl),
- getI32Imm(CodeAddrSpace, dl),
- getI32Imm(vecType, dl),
- getI32Imm(fromType, dl),
- getI32Imm(fromTypeWidth, dl),
- N1,
- Chain};
+ SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
+ getI32Imm(vecType, dl), getI32Imm(fromType, dl),
+ getI32Imm(fromTypeWidth, dl), N1, Chain };
NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
}
@@ -1233,8 +1065,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());
- // Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(MemSD, Subtarget);
+ // Volatile Setting
+ // - .volatile is only available for .global and .shared
+ bool IsVolatile = MemSD->isVolatile();
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
+ IsVolatile = false;
// Vector Setting
MVT SimpleVT = LoadedVT.getSimpleVT();
@@ -1301,13 +1138,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Addr,
- Chain};
+ SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL), Addr, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else if (PointerSize == 64
? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
@@ -1330,14 +1163,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Base,
- Offset,
- Chain};
+ SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else if (PointerSize == 64
? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
@@ -1380,14 +1208,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Base,
- Offset,
- Chain};
+ SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
} else {
@@ -1430,13 +1253,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}
if (!Opcode)
return false;
- SDValue Ops[] = {getI32Imm(CodeMemorySem, DL),
- getI32Imm(CodeAddrSpace, DL),
- getI32Imm(VecType, DL),
- getI32Imm(FromType, DL),
- getI32Imm(FromTypeWidth, DL),
- Op1,
- Chain};
+ SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
+ getI32Imm(VecType, DL), getI32Imm(FromType, DL),
+ getI32Imm(FromTypeWidth, DL), Op1, Chain };
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
}
@@ -1879,13 +1698,27 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
if (!StoreVT.isSimple())
return false;
+ AtomicOrdering Ordering = ST->getSuccessOrdering();
+ // In order to lower atomic stores with stronger guarantees we would need to
+ // use store.release or insert fences. However these features were only added
+ // with PTX ISA 6.0 / sm_70.
+ // TODO: Check if we can actually use the new instructions and implement them.
+ if (isStrongerThanMonotonic(Ordering))
+ return false;
+
// Address Space Setting
unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
unsigned int PointerSize =
CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());
- // Memory Semantic Setting
- unsigned int CodeMemorySem = getCodeMemorySemantic(ST, Subtarget);
+ // Volatile Setting
+ // - .volatile is only available for .global and .shared
+ // - .volatile has the same memory synchronization semantics as .relaxed.sys
+ bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
+ if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
+ CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC...
[truncated]
``````````
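For readers skimming the revert: the table in the deleted getCodeMemorySemantic() above summarizes what the reverted patch emitted. A minimal IR-level sketch of the main cases it covered (function names and the addrspace(1) global pointers are illustrative, not taken from the patch):

```llvm
; Relaxed (monotonic) atomics on global memory. The reverted patch selected
; ld.relaxed.sys / st.relaxed.sys for these on sm_70+ (PTX 6.0+); after the
; revert they lower to ld.volatile / st.volatile, which PTX defines to have
; the same memory synchronization semantics.
define i32 @relaxed_load(ptr addrspace(1) %p) {
  %v = load atomic i32, ptr addrspace(1) %p monotonic, align 4
  ret i32 %v
}

define void @relaxed_store(ptr addrspace(1) %p, i32 %v) {
  store atomic i32 %v, ptr addrspace(1) %p monotonic, align 4
  ret void
}

; Volatile relaxed atomic on global memory. Per the table, the reverted patch
; used ld.mmio.relaxed.sys here on sm_70+ with PTX 8.2+; after the revert it
; is treated as an ordinary volatile access (ld.volatile).
define i32 @volatile_relaxed_load(ptr addrspace(1) %p) {
  %v = load atomic volatile i32, ptr addrspace(1) %p monotonic, align 4
  ret i32 %v
}
```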
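Orderings stronger than monotonic hit the restored bail-out paths in tryLoad()/tryStore(). A sketch of inputs that reach them (again with illustrative names):

```llvm
; Acquire load / release store. The reverted patch emitted ld.acquire.sys /
; st.release.sys for these on sm_70+; with the revert, tryLoad()/tryStore()
; return false for any ordering stronger than monotonic (see the restored
; isStrongerThanMonotonic checks), leaving them unhandled until the TODO
; about PTX ISA 6.0 / sm_70 features is addressed.
define i32 @acquire_load(ptr addrspace(1) %p) {
  %v = load atomic i32, ptr addrspace(1) %p acquire, align 4
  ret i32 %v
}

define void @release_store(ptr addrspace(1) %p, i32 %v) {
  store atomic i32 %v, ptr addrspace(1) %p release, align 4
  ret void
}
```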
https://github.com/llvm/llvm-project/pull/99695