[llvm] [NVPTX] Composable and Extensible Memory Cache Control Hints (PR #175901)
Fei Peng via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 8 23:15:34 PST 2026
https://github.com/fiigii updated https://github.com/llvm/llvm-project/pull/175901
>From b0a90fa86e1c9e10f35371ff82f2044578275d84 Mon Sep 17 00:00:00 2001
From: Fei Peng <feip at nvidia.com>
Date: Tue, 13 Jan 2026 23:52:15 -0800
Subject: [PATCH 01/10] [NVPTX] Composable and Extensible Memory Cache Control
Hints
Implement the NVPTX backend support for the !mem.cache_hint metadata
proposal described in:
https://discourse.llvm.org/t/rfc-composable-and-extensible-memory-cache-control-hints-in-llvm-ir/89443
This enables fine-grained cache control on load, store, and memcpy
instructions, lowering to PTX cache qualifiers.
Supported cache hints:
- L1 eviction: L1::evict_first, L1::evict_last, L1::evict_unchanged,
L1::no_allocate (requires SM 70+)
- L2 eviction: L2::evict_first, L2::evict_last (requires SM 70+)
- L2 prefetch: L2::64B, L2::128B (SM 75+), L2::256B (SM 80+)
- L2::cache_hint with 64-bit cache policy descriptor (SM 80+, PTX 7.4+)
The metadata uses a key-value format with operand_no to identify which
pointer operand the hints apply to:
!0 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first"}
For memcpy, operand_no 0 refers to the destination (store) and
operand_no 1 refers to the source (load).
Implementation details:
- Added cache hint enums and encoding/decoding utilities to NVPTX.h
- Extended TargetLowering with recordTargetMMOInfo() hook for metadata
- Modified load/store instruction patterns to include CacheHint operand
- Added printCacheHint() to NVPTXInstPrinter for PTX emission
- SM version guards ensure hints are only emitted when hardware supports
them; unsupported hints are silently dropped
- Per-function cache policy storage in NVPTXTargetMachine for L2::cache_hint
mode which requires an additional 64-bit register operand
---
llvm/include/llvm/CodeGen/TargetLowering.h | 13 +
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 54 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 24 +-
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 79 ++
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 3 +
llvm/lib/Target/NVPTX/NVPTX.h | 124 +++
llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp | 9 +-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 144 +++-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 9 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 39 +
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 13 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 149 ++--
.../Target/NVPTX/NVPTXReplaceImageHandles.cpp | 5 +-
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 17 +
llvm/lib/Target/NVPTX/NVPTXTargetMachine.h | 26 +
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 187 +++++
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 12 +
.../floating-point-immediate-operands.mir | 12 +-
.../CodeGen/NVPTX/cache-hint-sm-version.ll | 346 ++++++++
.../CodeGen/NVPTX/load-store-cache-hint.ll | 773 ++++++++++++++++++
.../NVPTX/machinelicm-no-preheader.mir | 16 +-
llvm/test/CodeGen/NVPTX/memcpy-cache-hint.ll | 406 +++++++++
llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir | 8 +-
23 files changed, 2358 insertions(+), 110 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/cache-hint-sm-version.ll
create mode 100644 llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
create mode 100644 llvm/test/CodeGen/NVPTX/memcpy-cache-hint.ll
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index ada4ffd3bcc89..8cfd1df7eb5a6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -476,6 +476,19 @@ class LLVM_ABI TargetLoweringBase {
MachineMemOperand::Flags
getVPIntrinsicMemOperandFlags(const VPIntrinsic &VPIntrin) const;
+ /// Hook for targets to record additional information about a
+ /// MachineMemOperand after it is created from an IR instruction.
+ /// This is called by SelectionDAGBuilder for load/store instructions
+ /// and during memcpy/memmove lowering.
+ /// Targets can use this to store target-specific cache policies or
+ /// other per-memop metadata in a side table.
+ /// The OperandNo parameter specifies which memory operand of the instruction
+ /// this MMO corresponds to (used for multi-operand instructions like memcpy
+ /// where operand 0 is dest and operand 1 is src).
+ /// The default implementation does nothing.
+ virtual void recordTargetMMOInfo(MachineMemOperand *MMO, const Instruction &I,
+ unsigned OperandNo = 0) const {}
+
virtual bool isSelectSupported(SelectSupportKind /*kind*/) const {
return true;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 302b8059e4df0..a7e033284286f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8767,11 +8767,13 @@ static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
}
}
-static SDValue getMemcpyLoadsAndStores(
- SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- uint64_t Size, Align Alignment, bool isVol, bool AlwaysInline,
- MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo,
- const AAMDNodes &AAInfo, BatchAAResults *BatchAA) {
+static SDValue
+getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+ SDValue Dst, SDValue Src, uint64_t Size,
+ Align Alignment, bool isVol, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo,
+ BatchAAResults *BatchAA, const CallInst *CI = nullptr) {
// Turn a memcpy of undef to nop.
// FIXME: We need to honor volatile even is Src is undef.
if (Src.isUndef())
@@ -8879,10 +8881,20 @@ static SDValue getMemcpyLoadsAndStores(
}
Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
if (Value.getNode()) {
+ // Create store MMO explicitly to allow targets to record cache hints.
+ MachineMemOperand *StoreMMO =
+ MF.getMachineMemOperand(DstPtrInfo.getWithOffset(DstOff),
+ MMOFlags | MachineMemOperand::MOStore,
+ VTSize, Alignment, NewAAInfo);
+ // Call hook for target-specific cache hint recording (operand 0 =
+ // dest).
+ if (CI)
+ TLI.recordTargetMMOInfo(StoreMMO, *CI, /*OperandNo=*/0);
+
Store = DAG.getStore(
Chain, dl, Value,
DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
- DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
+ StoreMMO);
OutChains.push_back(Store);
}
}
@@ -8904,17 +8916,33 @@ static SDValue getMemcpyLoadsAndStores(
if (isConstant)
SrcMMOFlags |= MachineMemOperand::MOInvariant;
+ // Create load MMO explicitly to allow targets to record cache hints.
+ MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
+ SrcPtrInfo.getWithOffset(SrcOff),
+ SrcMMOFlags | MachineMemOperand::MOLoad, VTSize,
+ commonAlignment(*SrcAlign, SrcOff), NewAAInfo);
+ // Call hook for target-specific cache hint recording (operand 1 = src).
+ if (CI)
+ TLI.recordTargetMMOInfo(LoadMMO, *CI, /*OperandNo=*/1);
+
Value = DAG.getExtLoad(
ISD::EXTLOAD, dl, NVT, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)),
- SrcPtrInfo.getWithOffset(SrcOff), VT,
- commonAlignment(*SrcAlign, SrcOff), SrcMMOFlags, NewAAInfo);
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)), VT,
+ LoadMMO);
OutLoadChains.push_back(Value.getValue(1));
+ // Create store MMO explicitly to allow targets to record cache hints.
+ MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
+ DstPtrInfo.getWithOffset(DstOff),
+ MMOFlags | MachineMemOperand::MOStore, VTSize, Alignment, NewAAInfo);
+ // Call hook for target-specific cache hint recording (operand 0 = dest).
+ if (CI)
+ TLI.recordTargetMMOInfo(StoreMMO, *CI, /*OperandNo=*/0);
+
Store = DAG.getTruncStore(
Chain, dl, Value,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
- DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags, NewAAInfo);
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), VT,
+ StoreMMO);
OutStoreChains.push_back(Store);
}
SrcOff += VTSize;
@@ -9395,7 +9423,7 @@ SDValue SelectionDAG::getMemcpy(
SDValue Result = getMemcpyLoadsAndStores(
*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment,
- isVol, false, DstPtrInfo, SrcPtrInfo, AAInfo, BatchAA);
+ isVol, false, DstPtrInfo, SrcPtrInfo, AAInfo, BatchAA, CI);
if (Result.getNode())
return Result;
}
@@ -9416,7 +9444,7 @@ SDValue SelectionDAG::getMemcpy(
assert(ConstantSize && "AlwaysInline requires a constant size!");
return getMemcpyLoadsAndStores(
*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment,
- isVol, true, DstPtrInfo, SrcPtrInfo, AAInfo, BatchAA);
+ isVol, true, DstPtrInfo, SrcPtrInfo, AAInfo, BatchAA, CI);
}
checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e191cc5524a14..b3e653c5a8483 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4766,8 +4766,16 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
: MachinePointerInfo();
SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
- SDValue L = DAG.getLoad(MemVTs[i], dl, Root, A, PtrInfo, Alignment,
- MMOFlags, AAInfo, Ranges);
+
+ // Create MMO explicitly so targets can record additional info.
+ MachineFunction &MF = DAG.getMachineFunction();
+ TypeSize Size = MemVTs[i].getStoreSize();
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, MMOFlags | MachineMemOperand::MOLoad,
+ Size, Alignment, AAInfo, Ranges);
+ TLI.recordTargetMMOInfo(MMO, I);
+
+ SDValue L = DAG.getLoad(MemVTs[i], dl, Root, A, MMO);
Chains[ChainI] = L.getValue(1);
if (MemVTs[i] != ValueVTs[i])
@@ -4909,8 +4917,16 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
SDValue Val = SDValue(Src.getNode(), Src.getResNo() + i);
if (MemVTs[i] != ValueVTs[i])
Val = DAG.getPtrExtOrTrunc(Val, dl, MemVTs[i]);
- SDValue St =
- DAG.getStore(Root, dl, Val, Add, PtrInfo, Alignment, MMOFlags, AAInfo);
+
+ // Create MMO explicitly so targets can record additional info.
+ MachineFunction &MF = DAG.getMachineFunction();
+ TypeSize Size = MemVTs[i].getStoreSize();
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, MMOFlags | MachineMemOperand::MOStore,
+ Size, Alignment, AAInfo);
+ TLI.recordTargetMMOInfo(MMO, I);
+
+ SDValue St = DAG.getStore(Root, dl, Val, Add, MMO);
Chains[ChainI] = St;
}
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 5a5793bc7bc13..df739978ac3ae 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -363,6 +363,85 @@ void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum,
llvm_unreachable(formatv("Unknown Modifier: {}", Modifier).str().c_str());
}
+void NVPTXInstPrinter::printCacheHint(const MCInst *MI, int OpNum,
+ raw_ostream &O, StringRef Modifier) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ unsigned Hint = MO.getImm();
+
+ // If no hint is set, print nothing
+ if (Hint == 0)
+ return;
+
+ // Check if L2::cache_hint mode is active
+ bool IsCacheHintMode = NVPTX::isL2CacheHintMode(Hint);
+
+ if (Modifier == "l1") {
+ // L1 qualifiers can be combined with L2::cache_hint mode
+ auto L1 = NVPTX::decodeL1Eviction(Hint);
+ switch (L1) {
+ case NVPTX::L1Eviction::Normal:
+ return;
+ case NVPTX::L1Eviction::Unchanged:
+ O << ".L1::evict_unchanged";
+ return;
+ case NVPTX::L1Eviction::First:
+ O << ".L1::evict_first";
+ return;
+ case NVPTX::L1Eviction::Last:
+ O << ".L1::evict_last";
+ return;
+ case NVPTX::L1Eviction::NoAllocate:
+ O << ".L1::no_allocate";
+ return;
+ }
+ } else if (Modifier == "l2") {
+ // Print L2 eviction qualifier if present
+ auto L2 = NVPTX::decodeL2Eviction(Hint);
+ switch (L2) {
+ case NVPTX::L2Eviction::Normal:
+ break;
+ case NVPTX::L2Eviction::First:
+ O << ".L2::evict_first";
+ break;
+ case NVPTX::L2Eviction::Last:
+ O << ".L2::evict_last";
+ break;
+ }
+ // In L2::cache_hint mode, also print the cache_hint qualifier
+ if (IsCacheHintMode)
+ O << ".L2::cache_hint";
+ return;
+ } else if (Modifier == "prefetch") {
+ // Prefetch qualifiers can be combined with L2::cache_hint mode
+ auto Prefetch = NVPTX::decodeL2Prefetch(Hint);
+ switch (Prefetch) {
+ case NVPTX::L2Prefetch::None:
+ return;
+ case NVPTX::L2Prefetch::Bytes64:
+ O << ".L2::64B";
+ return;
+ case NVPTX::L2Prefetch::Bytes128:
+ O << ".L2::128B";
+ return;
+ case NVPTX::L2Prefetch::Bytes256:
+ O << ".L2::256B";
+ return;
+ }
+ }
+ // Unknown modifier - silently ignore
+}
+
+void NVPTXInstPrinter::printCachePolicy(const MCInst *MI, int OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+ // If the operand is a register and valid, print ", $reg"
+ if (MO.isReg() && MO.getReg() != 0) {
+ O << ", ";
+ printRegName(O, MO.getReg());
+ }
+ // If it's an immediate 0 or invalid register, print nothing
+}
+
void NVPTXInstPrinter::printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O,
StringRef Modifier) {
const MCOperand &MO = MI->getOperand(OpNum);
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
index 09d4e6b1f18ed..f1cc1e5ec979d 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
@@ -42,6 +42,9 @@ class NVPTXInstPrinter : public MCInstPrinter {
StringRef Modifier = {});
void printAtomicCode(const MCInst *MI, int OpNum, raw_ostream &O,
StringRef Modifier = {});
+ void printCacheHint(const MCInst *MI, int OpNum, raw_ostream &O,
+ StringRef Modifier = {});
+ void printCachePolicy(const MCInst *MI, int OpNum, raw_ostream &O);
void printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O,
StringRef Modifier = {});
void printMemOperand(const MCInst *MI, int OpNum, raw_ostream &O,
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 210624fbb235c..a1d4e70d1bc26 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -15,16 +15,24 @@
#define LLVM_LIB_TARGET_NVPTX_NVPTX_H
#include "llvm/CodeGen/ISDOpcodes.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
+#include <optional>
+
namespace llvm {
+class Function;
class FunctionPass;
+class MachineMemOperand;
class MachineFunctionPass;
class NVPTXTargetMachine;
class PassRegistry;
+class Value;
namespace NVPTXCC {
enum CondCodes {
@@ -195,6 +203,122 @@ enum AddressSpace : AddressSpaceUnderlyingType {
Param = 101
};
+// Cache hint enums for !mem.cache_hint metadata
+// These correspond to PTX cache control qualifiers
+
+// L1 Eviction Policy - maps to PTX L1::evict_* qualifiers
+enum class L1Eviction : unsigned {
+ Normal = 0, // Default behavior (no qualifier)
+ Unchanged = 1, // L1::evict_unchanged
+ First = 2, // L1::evict_first
+ Last = 3, // L1::evict_last
+ NoAllocate = 4, // L1::no_allocate
+};
+
+// L2 Eviction Policy - maps to PTX L2::evict_* qualifiers
+enum class L2Eviction : unsigned {
+ Normal = 0, // Default behavior (no qualifier)
+ First = 1, // L2::evict_first
+ Last = 2, // L2::evict_last
+};
+
+// L2 Prefetch Size - maps to PTX L2::*B qualifiers
+enum class L2Prefetch : unsigned {
+ None = 0, // No prefetch hint
+ Bytes64 = 1, // L2::64B
+ Bytes128 = 2, // L2::128B
+ Bytes256 = 3, // L2::256B
+};
+
+// Bitfield layout for encoded cache hints:
+// Bits 0-2: L1 Eviction (3 bits, 5 values)
+// Bits 3-4: L2 Eviction (2 bits, 3 values)
+// Bits 5-6: L2 Prefetch (2 bits, 4 values)
+// Bit 7: L2::cache_hint mode flag (when set, use CachePolicy operand)
+// Bits 8-31: Reserved
+constexpr unsigned L1EvictionShift = 0;
+constexpr unsigned L1EvictionMask = 0x7;
+constexpr unsigned L2EvictionShift = 3;
+constexpr unsigned L2EvictionMask = 0x3;
+constexpr unsigned L2PrefetchShift = 5;
+constexpr unsigned L2PrefetchMask = 0x3;
+constexpr unsigned L2CacheHintFlag = 0x80; // Bit 7: L2::cache_hint mode
+
+inline unsigned encodeCacheHint(L1Eviction L1, L2Eviction L2, L2Prefetch P) {
+ return (static_cast<unsigned>(L1) << L1EvictionShift) |
+ (static_cast<unsigned>(L2) << L2EvictionShift) |
+ (static_cast<unsigned>(P) << L2PrefetchShift);
+}
+
+inline L1Eviction decodeL1Eviction(unsigned Hint) {
+ return static_cast<L1Eviction>((Hint >> L1EvictionShift) & L1EvictionMask);
+}
+
+inline L2Eviction decodeL2Eviction(unsigned Hint) {
+ return static_cast<L2Eviction>((Hint >> L2EvictionShift) & L2EvictionMask);
+}
+
+inline L2Prefetch decodeL2Prefetch(unsigned Hint) {
+ return static_cast<L2Prefetch>((Hint >> L2PrefetchShift) & L2PrefetchMask);
+}
+
+inline bool isL2CacheHintMode(unsigned Hint) {
+ return (Hint & L2CacheHintFlag) != 0;
+}
+
+// Cache policy data for a single memory operation.
+// Stored per-MMO to avoid pointer collisions when multiple memops share
+// the same pointer value but have different cache policies.
+struct MMOCachePolicyData {
+ uint64_t
+ Policy; // The 64-bit cache policy value for L2::cache_hint (0 if not set)
+ unsigned CacheHint; // Other cache hints (L1 eviction, L2 eviction, prefetch)
+};
+
+// Per-function cache policy data. Keyed by MachineMemOperand* for direct
+// lookup during instruction selection, ensuring each memop gets its own policy.
+struct FunctionCachePolicyData {
+ DenseMap<MachineMemOperand *, MMOCachePolicyData> MMOMap;
+
+ void clear() { MMOMap.clear(); }
+};
+
+// Operand indices for LD (load) machine instructions.
+// These match the operand order in NVPTXInstrInfo.td LD class.
+// Use LDOp::* for getOperand() (includes def), or subtract 1 for uses()
+// iterator.
+namespace LDOp {
+enum : unsigned {
+ Dst = 0, // Output register (def)
+ Ordering = 1, // Memory ordering (sem)
+ Scope = 2, // Memory scope
+ AddrSpace = 3, // Address space
+ Sign = 4, // Signedness
+ Width = 5, // Load width in bits
+ UsedBytes = 6, // Used bytes mask
+ CacheHint = 7, // Cache hint flags
+ Base = 8, // Base pointer (from ADDR)
+ Offset = 9, // Offset (from ADDR)
+ Policy = 10 // Cache policy register
+};
+} // namespace LDOp
+
+// Operand indices for ST (store) machine instructions.
+// These match the operand order in NVPTXInstrInfo.td ST class.
+namespace STOp {
+enum : unsigned {
+ Value = 0, // Value to store
+ Ordering = 1, // Memory ordering (sem)
+ Scope = 2, // Memory scope
+ AddrSpace = 3, // Address space
+ Width = 4, // Store width in bits
+ CacheHint = 5, // Cache hint flags
+ Base = 6, // Base pointer (from ADDR)
+ Offset = 7, // Offset (from ADDR)
+ Policy = 8 // Cache policy register
+};
+} // namespace STOp
+
namespace PTXLdStInstCode {
enum FromType { Unsigned = 0, Signed, Float, Untyped };
} // namespace PTXLdStInstCode
diff --git a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
index c8b53571c1e59..25eb2da33bbd9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
@@ -96,12 +96,13 @@ static bool eliminateMove(MachineInstr &Mov, const MachineRegisterInfo &MRI,
const MachineOperand *ParamSymbol = Mov.uses().begin();
assert(ParamSymbol->isSymbol());
- constexpr unsigned LDInstBasePtrOpIdx = 6;
- constexpr unsigned LDInstAddrSpaceOpIdx = 2;
+ // uses() iterator skips defs, so subtract 1 from LDOp indices
+ constexpr unsigned LDUsesBasePtrIdx = NVPTX::LDOp::Base - 1;
+ constexpr unsigned LDUsesAddrSpaceIdx = NVPTX::LDOp::AddrSpace - 1;
for (auto *LI : LoadInsts) {
- (LI->uses().begin() + LDInstBasePtrOpIdx)
+ (LI->uses().begin() + LDUsesBasePtrIdx)
->ChangeToES(ParamSymbol->getSymbolName());
- (LI->uses().begin() + LDInstAddrSpaceOpIdx)
+ (LI->uses().begin() + LDUsesAddrSpaceIdx)
->ChangeToImmediate(NVPTX::AddressSpace::Param);
}
return true;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 99982ff3181b3..9b28e2a0967ab 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -69,7 +69,14 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
Scopes = NVPTXScopes(MF.getFunction().getContext());
- return SelectionDAGISel::runOnMachineFunction(MF);
+
+ bool Result = SelectionDAGISel::runOnMachineFunction(MF);
+
+ // Clear per-function cache policy data after instruction selection completes
+ // to prevent memory growth over time.
+ TM.clearCachePolicyData(&MF.getFunction());
+
+ return Result;
}
NVPTX::DivPrecisionLevel
@@ -1110,6 +1117,113 @@ bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
return true;
}
+// Helper to extract cache hint from a MemSDNode via MMO lookup.
+// The cache hint is stored per-MMO by recordTargetMMOInfo().
+static unsigned getCacheHint(const MemSDNode *N, const Function &F,
+ const NVPTXTargetMachine &TM) {
+ MachineMemOperand *MMO = N->getMemOperand();
+ if (!MMO)
+ return 0;
+
+ auto &Data = TM.getCachePolicyData(&F);
+ auto It = Data.MMOMap.find(MMO);
+ if (It == Data.MMOMap.end())
+ return 0;
+
+ return It->second.CacheHint;
+}
+
+// Helper to get cache policy value if present (for L2::cache_hint mode).
+// Returns the 64-bit policy descriptor stored per-MMO.
+static std::optional<uint64_t> getCachePolicy(const MemSDNode *N,
+ const Function &F,
+ const NVPTXTargetMachine &TM) {
+ MachineMemOperand *MMO = N->getMemOperand();
+ if (!MMO)
+ return std::nullopt;
+
+ auto &Data = TM.getCachePolicyData(&F);
+ auto It = Data.MMOMap.find(MMO);
+ if (It == Data.MMOMap.end())
+ return std::nullopt;
+
+ // Only return policy if L2CacheHintFlag is set (indicating policy mode)
+ if (!(It->second.CacheHint & NVPTX::L2CacheHintFlag))
+ return std::nullopt;
+
+ return It->second.Policy;
+}
+
+std::pair<unsigned, SDValue> NVPTXDAGToDAGISel::getCacheHintAndPolicyReg(
+ const MemSDNode *N, unsigned CodeAddrSpace, const SDLoc &DL) {
+ // Extract cache hint from MMO flags
+ unsigned CacheHint = getCacheHint(N, MF->getFunction(), TM);
+ SDValue PolicyReg;
+
+ // Apply SM version guards for cache hints (from PTX ISA documentation):
+ // - L1::evict_* requires SM 70+
+ // - L2::evict_* requires SM 70+
+ // - L2::64B and L2::128B require SM 75+
+ // - L2::256B requires SM 80+
+ // - L2::cache_hint requires SM 80+ and PTX 7.4+
+
+ // Check L1 eviction hint (SM 70+)
+ if (!Subtarget->hasL1EvictionHint()) {
+ CacheHint &= ~(NVPTX::L1EvictionMask << NVPTX::L1EvictionShift);
+ }
+
+ // Check L2 eviction hint (SM 70+)
+ if (!Subtarget->hasL2EvictionHint()) {
+ CacheHint &= ~(NVPTX::L2EvictionMask << NVPTX::L2EvictionShift);
+ }
+
+ // Check L2 prefetch hints (SM 75+ for 64B/128B, SM 80+ for 256B)
+ auto L2Prefetch = NVPTX::decodeL2Prefetch(CacheHint);
+ if (L2Prefetch != NVPTX::L2Prefetch::None) {
+ bool PrefetchSupported = false;
+ switch (L2Prefetch) {
+ case NVPTX::L2Prefetch::Bytes64:
+ PrefetchSupported = Subtarget->hasL2Prefetch64B();
+ break;
+ case NVPTX::L2Prefetch::Bytes128:
+ PrefetchSupported = Subtarget->hasL2Prefetch128B();
+ break;
+ case NVPTX::L2Prefetch::Bytes256:
+ PrefetchSupported = Subtarget->hasL2Prefetch256B();
+ break;
+ default:
+ break;
+ }
+ if (!PrefetchSupported) {
+ // Clear the prefetch bits if not supported
+ CacheHint &= ~(NVPTX::L2PrefetchMask << NVPTX::L2PrefetchShift);
+ }
+ }
+
+ // L2::cache_hint is only supported for global address space.
+ // Clear the flag for non-global address spaces.
+ if (CodeAddrSpace != NVPTX::AddressSpace::Global) {
+ CacheHint &= ~NVPTX::L2CacheHintFlag;
+ } else if (Subtarget->hasL2CacheHint()) {
+ // Check for L2::cache_hint with cache-policy (requires SM 80+ and PTX 7.4+)
+ if (auto CachePolicyVal = getCachePolicy(N, MF->getFunction(), TM)) {
+ SDValue PolicyConst =
+ CurDAG->getTargetConstant(*CachePolicyVal, DL, MVT::i64);
+ PolicyReg = SDValue(
+ CurDAG->getMachineNode(NVPTX::MOV_B64_i, DL, MVT::i64, PolicyConst),
+ 0);
+ }
+ }
+
+ // If no policy or L2::cache_hint not supported, use NOREG and clear flag
+ if (!PolicyReg) {
+ PolicyReg = CurDAG->getRegister(NVPTX::NoRegister, MVT::i64);
+ CacheHint &= ~NVPTX::L2CacheHintFlag;
+ }
+
+ return {CacheHint, PolicyReg};
+}
+
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
MemSDNode *LD = cast<MemSDNode>(N);
assert(LD->readMem() && "Expected load");
@@ -1152,19 +1266,24 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
FromTypeWidth <= 128 && "Invalid width for load");
- // Create the machine instruction DAG
+ const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
+ const auto [CacheHint, PolicyReg] =
+ getCacheHintAndPolicyReg(LD, CodeAddrSpace, DL);
+
+ // Create the machine instruction DAG
SDValue Ops[] = {getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL),
getI32Imm(UsedBytesMask, DL),
+ getI32Imm(CacheHint, DL),
Base,
Offset,
+ PolicyReg,
Chain};
- const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
const std::optional<unsigned> Opcode =
pickOpcodeForVT(TargetVT, NVPTX::LD_i16, NVPTX::LD_i32, NVPTX::LD_i64);
if (!Opcode)
@@ -1225,6 +1344,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
+ const auto [CacheHint, PolicyReg] =
+ getCacheHintAndPolicyReg(LD, CodeAddrSpace, DL);
const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
SDValue Ops[] = {getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
@@ -1232,8 +1353,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL),
getI32Imm(UsedBytesMask, DL),
+ getI32Imm(CacheHint, DL),
Base,
Offset,
+ PolicyReg,
Chain};
std::optional<unsigned> Opcode;
@@ -1410,13 +1533,20 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
"Invalid width for store");
const auto [Base, Offset] = selectADDR(ST->getBasePtr(), CurDAG);
+
+ // Extract cache hint and policy register
+ const auto [CacheHint, PolicyReg] =
+ getCacheHintAndPolicyReg(ST, CodeAddrSpace, DL);
+
SDValue Ops[] = {selectPossiblyImm(Value),
getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(ToTypeWidth, DL),
+ getI32Imm(CacheHint, DL),
Base,
Offset,
+ PolicyReg,
Chain};
const std::optional<unsigned> Opcode =
@@ -1462,10 +1592,14 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
TotalWidth <= 256 && "Invalid width for store");
+ // Extract cache hint and policy register
+ const auto [CacheHint, PolicyReg] =
+ getCacheHintAndPolicyReg(ST, CodeAddrSpace, DL);
+
const auto [Base, Offset] = selectADDR(Addr, CurDAG);
Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
- getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL), Base,
- Offset, Chain});
+ getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL),
+ getI32Imm(CacheHint, DL), Base, Offset, PolicyReg, Chain});
const MVT::SimpleValueType EltVT =
ST->getOperand(1).getSimpleValueType().SimpleTy;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index fcb5700dcb6d4..ab8fafa2d9764 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -105,6 +105,15 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
SDValue getPTXCmpMode(const CondCodeSDNode &CondCode);
SDValue selectPossiblyImm(SDValue V);
+ // Returns the cache hint and policy register for a memory operation.
+ // If L2::cache_hint mode is active (SM 80+, PTX 7.4+, global address space),
+ // returns the updated cache hint with L2CacheHintFlag set and a register
+ // containing the 64-bit policy value. Otherwise returns the original hint
+ // and NOREG.
+ std::pair<unsigned, SDValue> getCacheHintAndPolicyReg(const MemSDNode *N,
+ unsigned CodeAddrSpace,
+ const SDLoc &DL);
+
// Returns the Memory Order and Scope that the PTX memory instruction should
// use, and inserts appropriate fence instruction before the memory
// instruction, if needed to implement the instructions memory order. Required
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 755d270563786..b3d45e68fbdcb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -7346,6 +7346,45 @@ void NVPTXTargetLowering::ReplaceNodeResults(
}
}
+MachineMemOperand::Flags
+NVPTXTargetLowering::getTargetMMOFlags(const Instruction &I) const {
+ // Cache policy info is now stored via recordTargetMMOInfo hook.
+ // This function is kept for compatibility but doesn't need to return
+ // anything special - the actual cache hints are stored per-MMO.
+ return MachineMemOperand::MONone;
+}
+
+void NVPTXTargetLowering::recordTargetMMOInfo(MachineMemOperand *MMO,
+ const Instruction &I,
+ unsigned OperandNo) const {
+ // Check for !mem.cache_hint metadata on memory-accessing instructions.
+ // Supported: LoadInst, StoreInst, and memory intrinsics like memcpy.
+ if (!I.mayReadOrWriteMemory())
+ return;
+
+ // Get cache hint from metadata using the specified operand number.
+ // For load/store: operand_no = 0
+ // For memcpy: operand_no = 0 (dest/store), operand_no = 1 (src/load)
+ unsigned CacheHint = NVPTX::getCacheHintFromMetadata(&I, OperandNo);
+
+ // Check for cache_policy (L2::cache_hint mode)
+ uint64_t CachePolicy = 0;
+ if (auto Policy = NVPTX::getCachePolicyFromMetadata(&I, OperandNo)) {
+ CachePolicy = *Policy;
+ // Set the L2CacheHintFlag to indicate policy mode
+ CacheHint |= NVPTX::L2CacheHintFlag;
+ }
+
+ // If no cache hints, nothing to store
+ if (CacheHint == 0 && CachePolicy == 0)
+ return;
+
+ // Store in per-function map keyed by MMO pointer
+ const Function *F = I.getFunction();
+ auto &Data = nvTM->getCachePolicyData(F);
+ Data.MMOMap[MMO] = {CachePolicy, CacheHint};
+}
+
NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const {
Type *Ty = AI->getValOperand()->getType();
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 9f35fe1e866fa..3e27a47f2ffd4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -172,6 +172,19 @@ class NVPTXTargetLowering : public TargetLowering {
return AtomicExpansionKind::None;
}
+ /// Return target-specific MachineMemOperand flags for load/store
+ /// instructions. This is used to propagate !mem.cache_hint metadata.
+ MachineMemOperand::Flags
+ getTargetMMOFlags(const Instruction &I) const override;
+
+ /// Record cache policy info for a MachineMemOperand.
+ /// Called by SelectionDAGBuilder after creating an MMO from an IR
+ /// instruction. Stores policy/hints in a per-MMO map for lookup during
+ /// instruction selection. OperandNo specifies which memory operand (for
+ /// memcpy: 0=dest, 1=src).
+ void recordTargetMMOInfo(MachineMemOperand *MMO, const Instruction &I,
+ unsigned OperandNo) const override;
+
AtomicExpansionKind
shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index e5a492fa90fbd..f3125a9b9f381 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1608,6 +1608,8 @@ def AtomicCode : Operand<i32> {
let PrintMethod = "printAtomicCode";
}
+def CacheHint : Operand<i32> { let PrintMethod = "printCacheHint"; }
+
def MmaCode : Operand<i32> {
let PrintMethod = "printMmaCode";
}
@@ -1845,15 +1847,20 @@ def Callseq_End :
//
// Load / Store Handling
//
+// CachePolicy operand for L2::cache_hint support
+// When non-zero, indicates L2::cache_hint mode with the policy register
+def CachePolicy : Operand<i64> { let PrintMethod = "printCachePolicy"; }
+
class LD<NVPTXRegClass regclass>
- : NVPTXInst<
- (outs regclass:$dst),
- (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
- AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
- ADDR:$addr),
- "${usedBytes}"
- "ld${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr];">;
+ : NVPTXInst<(outs regclass:$dst),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
+ AtomicCode:$Sign, i32imm:$fromWidth,
+ UsedBytesMask:$usedBytes, CacheHint:$cacheHint, ADDR:$addr,
+ CachePolicy:$policy),
+ "${usedBytes}"
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
+ "cacheHint:l2}${cacheHint:prefetch}.${Sign:sign}$fromWidth "
+ "\t$dst, [$addr]${policy};">;
let mayLoad=1, hasSideEffects=0 in {
def LD_i16 : LD<B16>;
@@ -1862,13 +1869,13 @@ let mayLoad=1, hasSideEffects=0 in {
}
class ST<DAGOperand O>
- : NVPTXInst<
- (outs),
- (ins O:$src,
- AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$toWidth,
- ADDR:$addr),
- "st${sem:sem}${scope:scope}${addsp:addsp}.b$toWidth"
- " \t[$addr], $src;">;
+ : NVPTXInst<(outs),
+ (ins O:$src, AtomicCode:$sem, AtomicCode:$scope,
+ AtomicCode:$addsp, i32imm:$toWidth, CacheHint:$cacheHint,
+ ADDR:$addr, CachePolicy:$policy),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
+ "cacheHint:l2}${cacheHint:prefetch}.b$toWidth"
+ " \t[$addr], $src${policy};">;
let mayStore=1, hasSideEffects=0 in {
def ST_i16 : ST<RI16>;
@@ -1880,33 +1887,41 @@ let mayStore=1, hasSideEffects=0 in {
// elementization happens at the machine instruction level, so the following
// instructions never appear in the DAG.
multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
- def _v2 : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
- AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
- ADDR:$addr),
- "${usedBytes}"
- "ld${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr];">;
- def _v4 : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
- (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
- AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
- ADDR:$addr),
- "${usedBytes}"
- "ld${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];">;
+ def _v2
+ : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
+ AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
+ CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
+ "${usedBytes}"
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
+ "cacheHint:l2}${cacheHint:prefetch}.v2.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr]${policy};">;
+ def _v4
+ : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
+ AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
+ CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
+ "${usedBytes}"
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
+ "cacheHint:l2}${cacheHint:prefetch}.v4.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr]${policy};">;
if support_v8 then
- def _v8 : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
- regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
- (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
- AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
- ADDR:$addr),
- "${usedBytes}"
- "ld${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, "
- "[$addr];">;
+ def _v8
+ : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4, regclass:$dst5, regclass:$dst6,
+ regclass:$dst7, regclass:$dst8),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
+ AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
+ CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
+ "${usedBytes}"
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
+ "cacheHint:l2}${cacheHint:prefetch}.v8.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, "
+ "[$addr]${policy};">;
}
let mayLoad=1, hasSideEffects=0 in {
defm LDV_i16 : LD_VEC<B16>;
@@ -1915,30 +1930,36 @@ let mayLoad=1, hasSideEffects=0 in {
}
multiclass ST_VEC<DAGOperand O, bit support_v8 = false> {
- def _v2 : NVPTXInst<
- (outs),
- (ins O:$src1, O:$src2,
- AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth,
- ADDR:$addr),
- "st${sem:sem}${scope:scope}${addsp:addsp}.v2.b$fromWidth "
- "\t[$addr], {{$src1, $src2}};">;
- def _v4 : NVPTXInst<
- (outs),
- (ins RegOrSink:$src1, RegOrSink:$src2, RegOrSink:$src3, RegOrSink:$src4,
- AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth,
- ADDR:$addr),
- "st${sem:sem}${scope:scope}${addsp:addsp}.v4.b$fromWidth "
- "\t[$addr], {{$src1, $src2, $src3, $src4}};">;
+ def _v2
+ : NVPTXInst<(outs),
+ (ins O:$src1, O:$src2, AtomicCode:$sem, AtomicCode:$scope,
+ AtomicCode:$addsp, i32imm:$fromWidth,
+ CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
+ "cacheHint:l2}${cacheHint:prefetch}.v2.b$fromWidth "
+ "\t[$addr], {{$src1, $src2}}${policy};">;
+ def _v4
+ : NVPTXInst<(outs),
+ (ins RegOrSink:$src1, RegOrSink:$src2, RegOrSink:$src3,
+ RegOrSink:$src4, AtomicCode:$sem, AtomicCode:$scope,
+ AtomicCode:$addsp, i32imm:$fromWidth,
+ CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
+ "cacheHint:l2}${cacheHint:prefetch}.v4.b$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}}${policy};">;
if support_v8 then
- def _v8 : NVPTXInst<
- (outs),
- (ins RegOrSink:$src1, RegOrSink:$src2, RegOrSink:$src3, RegOrSink:$src4,
- RegOrSink:$src5, RegOrSink:$src6, RegOrSink:$src7, RegOrSink:$src8,
- AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth,
- ADDR:$addr),
- "st${sem:sem}${scope:scope}${addsp:addsp}.v8.b$fromWidth "
- "\t[$addr], "
- "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, $src8}};">;
+ def _v8
+ : NVPTXInst<(outs),
+ (ins RegOrSink:$src1, RegOrSink:$src2, RegOrSink:$src3,
+ RegOrSink:$src4, RegOrSink:$src5, RegOrSink:$src6,
+ RegOrSink:$src7, RegOrSink:$src8, AtomicCode:$sem,
+ AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth,
+ CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
+ "cacheHint:l2}${cacheHint:prefetch}.v8.b$fromWidth "
+ "\t[$addr], "
+ "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, "
+ "$src8}}${policy};">;
}
let mayStore=1, hasSideEffects=0 in {
diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index 4bbf49f93f43b..fadceefe0a00c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -1808,8 +1808,9 @@ bool NVPTXReplaceImageHandles::replaceImageHandle(MachineOperand &Op,
// For CUDA, we preserve the param loads coming from function arguments
return false;
- assert(TexHandleDef.getOperand(7).isSymbol() && "Load is not a symbol!");
- StringRef Sym = TexHandleDef.getOperand(7).getSymbolName();
+ assert(TexHandleDef.getOperand(NVPTX::LDOp::Base).isSymbol() &&
+ "Load is not a symbol!");
+ StringRef Sym = TexHandleDef.getOperand(NVPTX::LDOp::Base).getSymbolName();
InstrsToRemove.insert(&TexHandleDef);
Op.ChangeToES(Sym.data());
MFI->getImageHandleSymbolIndex(Sym);
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index aeface20f07f3..88ac19fab7b38 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -120,6 +120,23 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
+ // Cache hint SM version requirements (from PTX ISA documentation):
+ //
+ // | Cache Hint | SM Requirement | PTX Requirement |
+ // |-----------------|----------------|-----------------|
+ // | L1::evict_* | SM 70+ | - |
+ // | L2::evict_* | SM 70+ | - |
+ // | L2::64B | SM 75+ | - |
+ // | L2::128B | SM 75+ | - |
+ // | L2::256B | SM 80+ | - |
+ // | L2::cache_hint | SM 80+ | PTX 7.4+ |
+ //
+ bool hasL1EvictionHint() const { return SmVersion >= 70; }
+ bool hasL2EvictionHint() const { return SmVersion >= 70; }
+ bool hasL2Prefetch64B() const { return SmVersion >= 75; }
+ bool hasL2Prefetch128B() const { return SmVersion >= 75; }
+ bool hasL2Prefetch256B() const { return SmVersion >= 80; }
+ bool hasL2CacheHint() const { return SmVersion >= 80 && PTXVersion >= 74; }
// Checks following instructions support:
// - tcgen05.ld/st
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index 118a01a0352f5..3fc009af71a37 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXTARGETMACHINE_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXTARGETMACHINE_H
+#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "llvm/CodeGen/CodeGenTargetMachineImpl.h"
#include <optional>
@@ -32,6 +33,18 @@ class NVPTXTargetMachine : public CodeGenTargetMachineImpl {
BumpPtrAllocator StrAlloc;
UniqueStringSaver StrPool;
+ // Per-function cache policy storage for !mem.cache_hint metadata.
+ // Mutable because it's modified during const lowering operations.
+ // Data is keyed by Function* and each function is processed sequentially
+ // through the pipeline, so no synchronization is needed.
+ // IMPORTANT: Data must be cleared after instruction selection completes
+ // via clearCachePolicyData() in NVPTXDAGToDAGISel::runOnMachineFunction().
+ // The unique_ptr ensures cleanup even if clearCachePolicyData is not called,
+ // but explicit clearing prevents unbounded memory growth.
+ mutable DenseMap<const Function *,
+ std::unique_ptr<NVPTX::FunctionCachePolicyData>>
+ CachePolicyData;
+
public:
NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -76,6 +89,19 @@ class NVPTXTargetMachine : public CodeGenTargetMachineImpl {
std::pair<const Value *, unsigned>
getPredicatedAddrSpace(const Value *V) const override;
+
+ // Cache policy data management for !mem.cache_hint metadata.
+ // These methods are const but modify mutable state.
+ NVPTX::FunctionCachePolicyData &getCachePolicyData(const Function *F) const {
+ auto &Data = CachePolicyData[F];
+ if (!Data)
+ Data = std::make_unique<NVPTX::FunctionCachePolicyData>();
+ return *Data;
+ }
+
+ void clearCachePolicyData(const Function *F) const {
+ CachePolicyData.erase(F);
+ }
}; // NVPTXTargetMachine.
class NVPTXTargetMachine32 : public NVPTXTargetMachine {
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 8e97b422218f7..7dd3fc7f4ffa8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -16,12 +16,14 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/Mutex.h"
#include <cstdint>
@@ -32,6 +34,8 @@
#include <string>
#include <vector>
+#define DEBUG_TYPE "nvptx-utilities"
+
namespace llvm {
namespace {
@@ -377,4 +381,187 @@ bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM) {
!isKernelFunction(*F);
}
+namespace NVPTX {
+
+// Helper to parse L1 eviction policy from metadata string
+static std::optional<L1Eviction> parseL1Eviction(StringRef Str) {
+ return StringSwitch<std::optional<L1Eviction>>(Str)
+ .Case("normal", L1Eviction::Normal)
+ .Case("unchanged", L1Eviction::Unchanged)
+ .Case("first", L1Eviction::First)
+ .Case("last", L1Eviction::Last)
+ .Case("no_allocate", L1Eviction::NoAllocate)
+ .Default(std::nullopt);
+}
+
+// Helper to parse L2 eviction policy from metadata string
+static std::optional<L2Eviction> parseL2Eviction(StringRef Str) {
+ return StringSwitch<std::optional<L2Eviction>>(Str)
+ .Case("normal", L2Eviction::Normal)
+ .Case("first", L2Eviction::First)
+ .Case("last", L2Eviction::Last)
+ .Default(std::nullopt);
+}
+
+// Helper to parse L2 prefetch size from metadata string
+static std::optional<L2Prefetch> parseL2Prefetch(StringRef Str) {
+ return StringSwitch<std::optional<L2Prefetch>>(Str)
+ .Case("64B", L2Prefetch::Bytes64)
+ .Case("128B", L2Prefetch::Bytes128)
+ .Case("256B", L2Prefetch::Bytes256)
+ .Default(std::nullopt);
+}
+
+// Helper to find the metadata node matching a specific operand number.
+// The metadata structure is:
+// !mem.cache_hint = !{!node1, !node2, ...}
+// Each node contains key-value pairs (operand_no can be anywhere):
+// !node = !{!"operand_no", i32 N, !"nvvm.key1", value1, ...}
+// Returns the matching MDNode or nullptr if not found.
+static const MDNode *findCacheHintNode(const MDNode *MD, unsigned OperandNo) {
+ if (!MD)
+ return nullptr;
+
+ for (unsigned i = 0, e = MD->getNumOperands(); i < e; ++i) {
+ const MDNode *Node = dyn_cast<MDNode>(MD->getOperand(i));
+ if (!Node || Node->getNumOperands() < 2) {
+ LLVM_DEBUG(if (Node) dbgs()
+ << "NVPTX: Skipping malformed cache hint node with "
+ << Node->getNumOperands() << " operands\n");
+ continue;
+ }
+
+ // Search for operand_no in the node (can be at any position)
+ std::optional<unsigned> NodeOperandNo;
+ for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
+ const MDString *Key = dyn_cast<MDString>(Node->getOperand(j));
+ if (Key && Key->getString() == "operand_no") {
+ if (auto *OpNoMD =
+ dyn_cast<ConstantAsMetadata>(Node->getOperand(j + 1))) {
+ if (auto *OpNoCI = dyn_cast<ConstantInt>(OpNoMD->getValue()))
+ NodeOperandNo = OpNoCI->getZExtValue();
+ else
+ LLVM_DEBUG(dbgs()
+ << "NVPTX: operand_no value is not ConstantInt\n");
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "NVPTX: operand_no value is not ConstantAsMetadata\n");
+ }
+ break;
+ }
+ }
+
+ if (!NodeOperandNo) {
+ LLVM_DEBUG(dbgs() << "NVPTX: Cache hint node missing operand_no\n");
+ continue;
+ }
+
+ if (*NodeOperandNo == OperandNo)
+ return Node;
+ }
+
+ return nullptr;
+}
+
+unsigned getCacheHintFromMetadata(const Instruction *I, unsigned OperandNo) {
+ if (!I)
+ return 0;
+
+ MDNode *MD = I->getMetadata("mem.cache_hint");
+ const MDNode *Node = findCacheHintNode(MD, OperandNo);
+ if (!Node)
+ return 0;
+
+ L1Eviction L1 = L1Eviction::Normal;
+ L2Eviction L2 = L2Eviction::Normal;
+ L2Prefetch Prefetch = L2Prefetch::None;
+
+ // Parse all key-value pairs from the matching node
+ for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
+ const MDString *Key = dyn_cast<MDString>(Node->getOperand(j));
+ if (!Key) {
+ LLVM_DEBUG(dbgs() << "NVPTX: Cache hint key at index " << j
+ << " is not a string\n");
+ continue;
+ }
+
+ StringRef KeyStr = Key->getString();
+ if (KeyStr == "operand_no")
+ continue; // Already processed by findCacheHintNode
+
+ // For eviction and prefetch hints, value should be a string
+ const MDString *Val = dyn_cast<MDString>(Node->getOperand(j + 1));
+ if (!Val) {
+ // nvvm.l2_cache_hint uses i64, not string - skip here
+ if (KeyStr != "nvvm.l2_cache_hint") {
+ LLVM_DEBUG(dbgs() << "NVPTX: Value for '" << KeyStr
+ << "' is not a string\n");
+ }
+ continue;
+ }
+
+ StringRef ValStr = Val->getString();
+ if (KeyStr == "nvvm.l1_eviction") {
+ if (auto Parsed = parseL1Eviction(ValStr))
+ L1 = *Parsed;
+ else
+ LLVM_DEBUG(dbgs() << "NVPTX: Unknown L1 eviction policy: " << ValStr
+ << "\n");
+ } else if (KeyStr == "nvvm.l2_eviction") {
+ if (auto Parsed = parseL2Eviction(ValStr))
+ L2 = *Parsed;
+ else
+ LLVM_DEBUG(dbgs() << "NVPTX: Unknown L2 eviction policy: " << ValStr
+ << "\n");
+ } else if (KeyStr == "nvvm.l2_prefetch_size") {
+ if (auto Parsed = parseL2Prefetch(ValStr))
+ Prefetch = *Parsed;
+ else
+ LLVM_DEBUG(dbgs() << "NVPTX: Unknown L2 prefetch size: " << ValStr
+ << "\n");
+ }
+ // Unknown keys are silently ignored (may be target-specific extensions)
+ }
+
+ return encodeCacheHint(L1, L2, Prefetch);
+}
+
+std::optional<uint64_t> getCachePolicyFromMetadata(const Instruction *I,
+ unsigned OperandNo) {
+ if (!I)
+ return std::nullopt;
+
+ MDNode *MD = I->getMetadata("mem.cache_hint");
+ const MDNode *Node = findCacheHintNode(MD, OperandNo);
+ if (!Node)
+ return std::nullopt;
+
+ // Look for nvvm.l2_cache_hint in the matching node
+ for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
+ const MDString *Key = dyn_cast<MDString>(Node->getOperand(j));
+ if (!Key || Key->getString() != "nvvm.l2_cache_hint")
+ continue;
+
+ // The value should be an i64 constant
+ auto *ValMD = dyn_cast<ConstantAsMetadata>(Node->getOperand(j + 1));
+ if (!ValMD) {
+ LLVM_DEBUG(dbgs() << "NVPTX: nvvm.l2_cache_hint value is not "
+ "ConstantAsMetadata\n");
+ continue;
+ }
+ auto *ValCI = dyn_cast<ConstantInt>(ValMD->getValue());
+ if (!ValCI) {
+ LLVM_DEBUG(
+ dbgs() << "NVPTX: nvvm.l2_cache_hint value is not ConstantInt\n");
+ continue;
+ }
+
+ return ValCI->getZExtValue();
+ }
+
+ return std::nullopt;
+}
+
+} // namespace NVPTX
+
} // namespace llvm
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 1cdc21cc44d5a..39fc46ca1ec8b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -205,6 +205,18 @@ inline raw_ostream &operator<<(raw_ostream &O, AddressSpace A) {
return O;
}
+/// Parse !mem.cache_hint metadata from an instruction.
+/// Returns the encoded cache hint value, or 0 if no valid metadata is present.
+/// The OperandNo parameter specifies which pointer operand to look for
+/// (for instructions with multiple pointer operands like memcpy).
+unsigned getCacheHintFromMetadata(const Instruction *I, unsigned OperandNo = 0);
+
+/// Returns the L2::cache_hint value from !mem.cache_hint metadata, or
+/// std::nullopt if no nvvm.l2_cache_hint is specified. The value is a 64-bit
+/// constant used with the PTX L2::cache_hint qualifier.
+std::optional<uint64_t> getCachePolicyFromMetadata(const Instruction *I,
+ unsigned OperandNo = 0);
+
} // namespace NVPTX
} // namespace llvm
diff --git a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
index 3158916a3195c..43b485301d2a4 100644
--- a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
+++ b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir
@@ -40,9 +40,9 @@ registers:
- { id: 7, class: b32 }
body: |
bb.0.entry:
- %0 = LD_i32 0, 0, 4, 2, 32, -1, &test_param_0, 0
+ %0 = LD_i32 0, 0, 4, 2, 32, -1, 0, &test_param_0, 0, $noreg
%1 = CVT_f64_f32 %0, 0
- %2 = LD_i32 0, 0, 4, 0, 32, -1, &test_param_1, 0
+ %2 = LD_i32 0, 0, 4, 0, 32, -1, 0, &test_param_1, 0, $noreg
; CHECK: %3:b64 = FADD_rnf64ri %1, double 3.250000e+00
%3 = FADD_rnf64ri %1, double 3.250000e+00
%4 = CVT_f32_f64 %3, 5
@@ -50,7 +50,7 @@ body: |
; CHECK: %6:b32 = FADD_rnf32ri %5, float 6.250000e+00
%6 = FADD_rnf32ri %5, float 6.250000e+00, 0
%7 = FMUL_rnf32rr %6, %4, 0
- ST_i32 %7, 0, 0, 101, 32, &func_retval0, 0 :: (store (s32), addrspace 101)
+ ST_i32 %7, 0, 0, 101, 32, 0, &func_retval0, 0, $noreg :: (store (s32), addrspace 101)
Return
...
---
@@ -66,9 +66,9 @@ registers:
- { id: 7, class: b32 }
body: |
bb.0.entry:
- %0 = LD_i32 0, 0, 4, 2, 32, -1, &test2_param_0, 0
+ %0 = LD_i32 0, 0, 4, 2, 32, -1, 0, &test2_param_0, 0, $noreg
%1 = CVT_f64_f32 %0, 0
- %2 = LD_i32 0, 0, 4, 0, 32, -1, &test2_param_1, 0
+ %2 = LD_i32 0, 0, 4, 0, 32, -1, 0, &test2_param_1, 0, $noreg
; CHECK: %3:b64 = FADD_rnf64ri %1, double 0x7FF8000000000000
%3 = FADD_rnf64ri %1, double 0x7FF8000000000000
%4 = CVT_f32_f64 %3, 5
@@ -76,6 +76,6 @@ body: |
; CHECK: %6:b32 = FADD_rnf32ri %5, float 0x7FF8000000000000
%6 = FADD_rnf32ri %5, float 0x7FF8000000000000, 0
%7 = FMUL_rnf32rr %6, %4, 0
- ST_i32 %7, 0, 0, 101, 32, &func_retval0, 0 :: (store (s32), addrspace 101)
+ ST_i32 %7, 0, 0, 101, 32, 0, &func_retval0, 0, $noreg :: (store (s32), addrspace 101)
Return
...
diff --git a/llvm/test/CodeGen/NVPTX/cache-hint-sm-version.ll b/llvm/test/CodeGen/NVPTX/cache-hint-sm-version.ll
new file mode 100644
index 0000000000000..8f1b03b5fde9e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cache-hint-sm-version.ll
@@ -0,0 +1,346 @@
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_60 -mattr=+ptx74 | FileCheck %s --check-prefix=SM60
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx74 | FileCheck %s --check-prefix=SM70
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_75 -mattr=+ptx74 | FileCheck %s --check-prefix=SM75
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74 | FileCheck %s --check-prefix=SM80
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s --check-prefix=SM80-PTX70
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_86 -mattr=+ptx74 | FileCheck %s --check-prefix=SM86
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s --check-prefix=SM90
+
+; Test SM version requirements for cache hints (from PTX ISA documentation):
+; - L1::evict_* requires SM 70+
+; - L2::evict_* requires SM 70+
+; - L2::64B and L2::128B require SM 75+
+; - L2::256B requires SM 80+
+; - L2::cache_hint requires SM 80+ and PTX 7.4+
+
+;-----------------------------------------------------------------------------
+; L1 eviction - requires SM 70+
+; SM60 should NOT emit L1::evict_first (fall back to plain load)
+; SM70+ should emit L1::evict_first
+;-----------------------------------------------------------------------------
+
+; SM60-LABEL: test_load_l1_first
+; SM60: ld.global.b32
+; SM60-NOT: L1::evict_first
+
+; SM70-LABEL: test_load_l1_first
+; SM70: ld.global.L1::evict_first.b32
+
+; SM75-LABEL: test_load_l1_first
+; SM75: ld.global.L1::evict_first.b32
+
+; SM80-LABEL: test_load_l1_first
+; SM80: ld.global.L1::evict_first.b32
+
+; SM86-LABEL: test_load_l1_first
+; SM86: ld.global.L1::evict_first.b32
+
+; SM90-LABEL: test_load_l1_first
+; SM90: ld.global.L1::evict_first.b32
+define i32 @test_load_l1_first(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !0
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; L2 eviction - requires SM 70+
+; SM60 should NOT emit L2::evict_last (fall back to plain load)
+; SM70+ should emit L2::evict_last
+;-----------------------------------------------------------------------------
+
+; SM60-LABEL: test_load_l2_last
+; SM60: ld.global.b32
+; SM60-NOT: L2::evict_last
+
+; SM70-LABEL: test_load_l2_last
+; SM70: ld.global.L2::evict_last.b32
+
+; SM75-LABEL: test_load_l2_last
+; SM75: ld.global.L2::evict_last.b32
+
+; SM80-LABEL: test_load_l2_last
+; SM80: ld.global.L2::evict_last.b32
+
+; SM86-LABEL: test_load_l2_last
+; SM86: ld.global.L2::evict_last.b32
+
+; SM90-LABEL: test_load_l2_last
+; SM90: ld.global.L2::evict_last.b32
+define i32 @test_load_l2_last(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !1
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; L2::64B prefetch - requires SM 75+
+; SM60/SM70 should NOT emit L2::64B (fall back to plain load)
+; SM75+ should emit L2::64B
+;-----------------------------------------------------------------------------
+
+; SM60-LABEL: test_load_prefetch_64
+; SM60: ld.global.b32
+; SM60-NOT: L2::64B
+
+; SM70-LABEL: test_load_prefetch_64
+; SM70: ld.global.b32
+; SM70-NOT: L2::64B
+
+; SM75-LABEL: test_load_prefetch_64
+; SM75: ld.global.L2::64B.b32
+
+; SM80-LABEL: test_load_prefetch_64
+; SM80: ld.global.L2::64B.b32
+
+; SM86-LABEL: test_load_prefetch_64
+; SM86: ld.global.L2::64B.b32
+
+; SM90-LABEL: test_load_prefetch_64
+; SM90: ld.global.L2::64B.b32
+define i32 @test_load_prefetch_64(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !6
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; L2::128B prefetch - requires SM 75+
+; SM60/SM70 should NOT emit L2::128B (fall back to plain load)
+; SM75+ should emit L2::128B
+;-----------------------------------------------------------------------------
+
+; SM60-LABEL: test_load_prefetch_128
+; SM60: ld.global.b32
+; SM60-NOT: L2::128B
+
+; SM70-LABEL: test_load_prefetch_128
+; SM70: ld.global.b32
+; SM70-NOT: L2::128B
+
+; SM75-LABEL: test_load_prefetch_128
+; SM75: ld.global.L2::128B.b32
+
+; SM80-LABEL: test_load_prefetch_128
+; SM80: ld.global.L2::128B.b32
+
+; SM86-LABEL: test_load_prefetch_128
+; SM86: ld.global.L2::128B.b32
+
+; SM90-LABEL: test_load_prefetch_128
+; SM90: ld.global.L2::128B.b32
+define i32 @test_load_prefetch_128(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !2
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; L2::256B prefetch - requires SM 80+
+; SM60/SM70/SM75 should NOT emit L2::256B (fall back to plain load)
+; SM80+ should emit L2::256B
+;-----------------------------------------------------------------------------
+
+; SM60-LABEL: test_load_prefetch_256
+; SM60: ld.global.b32
+; SM60-NOT: L2::256B
+
+; SM70-LABEL: test_load_prefetch_256
+; SM70: ld.global.b32
+; SM70-NOT: L2::256B
+
+; SM75-LABEL: test_load_prefetch_256
+; SM75: ld.global.b32
+; SM75-NOT: L2::256B
+
+; SM80-LABEL: test_load_prefetch_256
+; SM80: ld.global.L2::256B.b32
+
+; SM86-LABEL: test_load_prefetch_256
+; SM86: ld.global.L2::256B.b32
+
+; SM90-LABEL: test_load_prefetch_256
+; SM90: ld.global.L2::256B.b32
+define i32 @test_load_prefetch_256(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !7
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; L2::cache_hint - requires SM 80+ and PTX 7.4+
+; SM60/SM70/SM75 should NOT emit L2::cache_hint (fall back to plain load)
+; SM80 with PTX < 7.4 should NOT emit L2::cache_hint
+; SM80+ with PTX 7.4+ should emit L2::cache_hint
+;-----------------------------------------------------------------------------
+
+; SM60-LABEL: test_load_cache_hint
+; SM60: ld.global.b32
+; SM60-NOT: L2::cache_hint
+
+; SM70-LABEL: test_load_cache_hint
+; SM70: ld.global.b32
+; SM70-NOT: L2::cache_hint
+
+; SM75-LABEL: test_load_cache_hint
+; SM75: ld.global.b32
+; SM75-NOT: L2::cache_hint
+
+; SM80-LABEL: test_load_cache_hint
+; SM80: mov.b64 [[POLICY:%rd[0-9]+]], 12345
+; SM80: ld.global.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+
+; SM80-PTX70-LABEL: test_load_cache_hint
+; SM80-PTX70: ld.global.b32
+; SM80-PTX70-NOT: L2::cache_hint
+
+; SM86-LABEL: test_load_cache_hint
+; SM86: mov.b64 [[POLICY:%rd[0-9]+]], 12345
+; SM86: ld.global.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+
+; SM90-LABEL: test_load_cache_hint
+; SM90: mov.b64 [[POLICY:%rd[0-9]+]], 12345
+; SM90: ld.global.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+define i32 @test_load_cache_hint(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !3
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; L2::cache_hint combined with L1 eviction on older SM
+; Both hints should be dropped on SM60
+; L1 hint emitted but L2::cache_hint dropped on SM70/SM75
+; Both emitted on SM80+
+;-----------------------------------------------------------------------------
+
+; SM60-LABEL: test_load_cache_hint_with_l1
+; SM60: ld.global.b32
+; SM60-NOT: L1::evict_first
+; SM60-NOT: L2::cache_hint
+
+; SM70-LABEL: test_load_cache_hint_with_l1
+; SM70: ld.global.L1::evict_first.b32
+; SM70-NOT: L2::cache_hint
+
+; SM75-LABEL: test_load_cache_hint_with_l1
+; SM75: ld.global.L1::evict_first.b32
+; SM75-NOT: L2::cache_hint
+
+; SM80-LABEL: test_load_cache_hint_with_l1
+; SM80: mov.b64 [[POLICY:%rd[0-9]+]], 44445
+; SM80: ld.global.L1::evict_first.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+
+; SM80-PTX70-LABEL: test_load_cache_hint_with_l1
+; SM80-PTX70: ld.global.L1::evict_first.b32
+; SM80-PTX70-NOT: L2::cache_hint
+define i32 @test_load_cache_hint_with_l1(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !4
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; L2::128B combined with L1 eviction on older SM
+; Both hints dropped on SM60
+; L1 hint emitted but L2::128B dropped on SM70
+; Both emitted on SM75+
+;-----------------------------------------------------------------------------
+
+; SM60-LABEL: test_load_prefetch_with_l1
+; SM60: ld.global.b32
+; SM60-NOT: L1::evict_first
+; SM60-NOT: L2::128B
+
+; SM70-LABEL: test_load_prefetch_with_l1
+; SM70: ld.global.L1::evict_first.b32
+; SM70-NOT: L2::128B
+
+; SM75-LABEL: test_load_prefetch_with_l1
+; SM75: ld.global.L1::evict_first.L2::128B.b32
+
+; SM80-LABEL: test_load_prefetch_with_l1
+; SM80: ld.global.L1::evict_first.L2::128B.b32
+define i32 @test_load_prefetch_with_l1(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !8
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; Store with L2::cache_hint
+;-----------------------------------------------------------------------------
+
+; SM60-LABEL: test_store_cache_hint
+; SM60: st.global.b32
+; SM60-NOT: L2::cache_hint
+
+; SM70-LABEL: test_store_cache_hint
+; SM70: st.global.b32
+; SM70-NOT: L2::cache_hint
+
+; SM80-LABEL: test_store_cache_hint
+; SM80: mov.b64 [[POLICY:%rd[0-9]+]], 67890
+; SM80: st.global.L2::cache_hint.b32 [{{%rd[0-9]+}}], {{%r[0-9]+}}, [[POLICY]]
+
+; SM80-PTX70-LABEL: test_store_cache_hint
+; SM80-PTX70: st.global.b32
+; SM80-PTX70-NOT: L2::cache_hint
+define void @test_store_cache_hint(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !5
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Store with L1 eviction hint
+;-----------------------------------------------------------------------------
+
+; SM60-LABEL: test_store_l1_no_allocate
+; SM60: st.global.b32
+; SM60-NOT: L1::no_allocate
+
+; SM70-LABEL: test_store_l1_no_allocate
+; SM70: st.global.L1::no_allocate.b32
+
+; SM80-LABEL: test_store_l1_no_allocate
+; SM80: st.global.L1::no_allocate.b32
+define void @test_store_l1_no_allocate(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !9
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Metadata definitions
+;-----------------------------------------------------------------------------
+
+; L1 eviction: first
+!0 = !{!100}
+!100 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first"}
+
+; L2 eviction: last
+!1 = !{!101}
+!101 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"last"}
+
+; L2 prefetch: 128B
+!2 = !{!102}
+!102 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"128B"}
+
+; L2::cache_hint only
+!3 = !{!103}
+!103 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 12345}
+
+; L2::cache_hint + L1 eviction
+!4 = !{!104}
+!104 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 44445, !"nvvm.l1_eviction", !"first"}
+
+; L2::cache_hint for store
+!5 = !{!105}
+!105 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 67890}
+
+; L2 prefetch: 64B
+!6 = !{!106}
+!106 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"64B"}
+
+; L2 prefetch: 256B
+!7 = !{!107}
+!107 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"256B"}
+
+; L2 prefetch: 128B + L1 eviction
+!8 = !{!108}
+!108 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"128B", !"nvvm.l1_eviction", !"first"}
+
+; L1 eviction: no_allocate (for store)
+!9 = !{!109}
+!109 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"no_allocate"}
diff --git a/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll b/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
new file mode 100644
index 0000000000000..13a1b5a2ffd50
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
@@ -0,0 +1,773 @@
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74 | %ptxas-verify %}
+
+; Test !mem.cache_hint metadata lowering to PTX cache qualifiers
+; PTX supports the following cache qualifiers:
+; L1 eviction: L1::evict_first, L1::evict_last, L1::evict_unchanged, L1::no_allocate
+; L2 eviction: L2::evict_first, L2::evict_last
+; L2 prefetch: L2::64B, L2::128B, L2::256B
+
+;-----------------------------------------------------------------------------
+; Basic L1 eviction policies for loads
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_load_l1_first
+; CHECK: ld.global.L1::evict_first.b32
+define i32 @test_load_l1_first(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !0
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_load_l1_last
+; CHECK: ld.global.L1::evict_last.b32
+define i32 @test_load_l1_last(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !1
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_load_l1_unchanged
+; CHECK: ld.global.L1::evict_unchanged.b32
+define i32 @test_load_l1_unchanged(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !2
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_load_l1_no_allocate
+; CHECK: ld.global.L1::no_allocate.b32
+define i32 @test_load_l1_no_allocate(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !3
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; Basic L2 eviction policies for loads
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_load_l2_first
+; CHECK: ld.global.L2::evict_first.b32
+define i32 @test_load_l2_first(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !4
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_load_l2_last
+; CHECK: ld.global.L2::evict_last.b32
+define i32 @test_load_l2_last(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !5
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; L2 prefetch sizes for loads
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_load_prefetch_64
+; CHECK: ld.global.L2::64B.b32
+define i32 @test_load_prefetch_64(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !6
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_load_prefetch_128
+; CHECK: ld.global.L2::128B.b32
+define i32 @test_load_prefetch_128(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !7
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_load_prefetch_256
+; CHECK: ld.global.L2::256B.b32
+define i32 @test_load_prefetch_256(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !8
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; All L1 + L2 combinations for loads
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_load_l1_first_l2_first
+; CHECK: ld.global.L1::evict_first.L2::evict_first.b32
+define i32 @test_load_l1_first_l2_first(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !20
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_load_l1_first_l2_last
+; CHECK: ld.global.L1::evict_first.L2::evict_last.b32
+define i32 @test_load_l1_first_l2_last(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !21
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_load_l1_last_l2_first
+; CHECK: ld.global.L1::evict_last.L2::evict_first.b32
+define i32 @test_load_l1_last_l2_first(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !22
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_load_l1_last_l2_last
+; CHECK: ld.global.L1::evict_last.L2::evict_last.b32
+define i32 @test_load_l1_last_l2_last(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !23
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; L1 + L2 + Prefetch combination for loads
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_load_l1_first_l2_last_prefetch_128
+; CHECK: ld.global.L1::evict_first.L2::evict_last.L2::128B.b32
+define i32 @test_load_l1_first_l2_last_prefetch_128(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !24
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; Basic L1 eviction policies for stores
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_store_l1_first
+; CHECK: st.global.L1::evict_first.b32
+define void @test_store_l1_first(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !0
+ ret void
+}
+
+; CHECK-LABEL: test_store_l1_last
+; CHECK: st.global.L1::evict_last.b32
+define void @test_store_l1_last(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !1
+ ret void
+}
+
+; CHECK-LABEL: test_store_l1_unchanged
+; CHECK: st.global.L1::evict_unchanged.b32
+define void @test_store_l1_unchanged(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !2
+ ret void
+}
+
+; CHECK-LABEL: test_store_l1_no_allocate
+; CHECK: st.global.L1::no_allocate.b32
+define void @test_store_l1_no_allocate(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !3
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Basic L2 eviction policies for stores
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_store_l2_first
+; CHECK: st.global.L2::evict_first.b32
+define void @test_store_l2_first(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !4
+ ret void
+}
+
+; CHECK-LABEL: test_store_l2_last
+; CHECK: st.global.L2::evict_last.b32
+define void @test_store_l2_last(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !5
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; All L1 + L2 combinations for stores
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_store_l1_first_l2_first
+; CHECK: st.global.L1::evict_first.L2::evict_first.b32
+define void @test_store_l1_first_l2_first(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !20
+ ret void
+}
+
+; CHECK-LABEL: test_store_l1_first_l2_last
+; CHECK: st.global.L1::evict_first.L2::evict_last.b32
+define void @test_store_l1_first_l2_last(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !21
+ ret void
+}
+
+; CHECK-LABEL: test_store_l1_last_l2_first
+; CHECK: st.global.L1::evict_last.L2::evict_first.b32
+define void @test_store_l1_last_l2_first(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !22
+ ret void
+}
+
+; CHECK-LABEL: test_store_l1_last_l2_last
+; CHECK: st.global.L1::evict_last.L2::evict_last.b32
+define void @test_store_l1_last_l2_last(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !23
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Different data types - loads
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_load_i16_l1_first
+; CHECK: ld.global.L1::evict_first.b16
+define i16 @test_load_i16_l1_first(ptr addrspace(1) %p) {
+ %v = load i16, ptr addrspace(1) %p, !mem.cache_hint !0
+ ret i16 %v
+}
+
+; CHECK-LABEL: test_load_i64_l1_last
+; CHECK: ld.global.L1::evict_last.b64
+define i64 @test_load_i64_l1_last(ptr addrspace(1) %p) {
+ %v = load i64, ptr addrspace(1) %p, !mem.cache_hint !1
+ ret i64 %v
+}
+
+; CHECK-LABEL: test_load_f32_l2_first
+; CHECK: ld.global.L2::evict_first.b32
+define float @test_load_f32_l2_first(ptr addrspace(1) %p) {
+ %v = load float, ptr addrspace(1) %p, !mem.cache_hint !4
+ ret float %v
+}
+
+; CHECK-LABEL: test_load_f64_l2_last
+; CHECK: ld.global.L2::evict_last.b64
+define double @test_load_f64_l2_last(ptr addrspace(1) %p) {
+ %v = load double, ptr addrspace(1) %p, !mem.cache_hint !5
+ ret double %v
+}
+
+;-----------------------------------------------------------------------------
+; Different data types - stores
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_store_i16_l1_first
+; CHECK: st.global.L1::evict_first.b16
+define void @test_store_i16_l1_first(ptr addrspace(1) %p, i16 %v) {
+ store i16 %v, ptr addrspace(1) %p, !mem.cache_hint !0
+ ret void
+}
+
+; CHECK-LABEL: test_store_i64_l2_last
+; CHECK: st.global.L2::evict_last.b64
+define void @test_store_i64_l2_last(ptr addrspace(1) %p, i64 %v) {
+ store i64 %v, ptr addrspace(1) %p, !mem.cache_hint !5
+ ret void
+}
+
+; CHECK-LABEL: test_store_f32_l1_no_allocate
+; CHECK: st.global.L1::no_allocate.b32
+define void @test_store_f32_l1_no_allocate(ptr addrspace(1) %p, float %v) {
+ store float %v, ptr addrspace(1) %p, !mem.cache_hint !3
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Vector loads with cache hints
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_load_v2i32_l1_first
+; CHECK: ld.global.L1::evict_first.v2.b32
+define <2 x i32> @test_load_v2i32_l1_first(ptr addrspace(1) %p) {
+ %v = load <2 x i32>, ptr addrspace(1) %p, !mem.cache_hint !0
+ ret <2 x i32> %v
+}
+
+; CHECK-LABEL: test_load_v4i32_l2_last
+; CHECK: ld.global.L2::evict_last.v4.b32
+define <4 x i32> @test_load_v4i32_l2_last(ptr addrspace(1) %p) {
+ %v = load <4 x i32>, ptr addrspace(1) %p, !mem.cache_hint !5
+ ret <4 x i32> %v
+}
+
+; CHECK-LABEL: test_load_v2f32_l1_unchanged
+; CHECK: ld.global.L1::evict_unchanged.v2.b32
+define <2 x float> @test_load_v2f32_l1_unchanged(ptr addrspace(1) %p) {
+ %v = load <2 x float>, ptr addrspace(1) %p, !mem.cache_hint !2
+ ret <2 x float> %v
+}
+
+; CHECK-LABEL: test_load_v2f64_prefetch_128
+; CHECK: ld.global.L2::128B.v2.b64
+define <2 x double> @test_load_v2f64_prefetch_128(ptr addrspace(1) %p) {
+ %v = load <2 x double>, ptr addrspace(1) %p, !mem.cache_hint !7
+ ret <2 x double> %v
+}
+
+;-----------------------------------------------------------------------------
+; Vector stores with cache hints
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_store_v2i32_l1_last
+; CHECK: st.global.L1::evict_last.v2.b32
+define void @test_store_v2i32_l1_last(ptr addrspace(1) %p, <2 x i32> %v) {
+ store <2 x i32> %v, ptr addrspace(1) %p, !mem.cache_hint !1
+ ret void
+}
+
+; CHECK-LABEL: test_store_v4i32_l2_first
+; CHECK: st.global.L2::evict_first.v4.b32
+define void @test_store_v4i32_l2_first(ptr addrspace(1) %p, <4 x i32> %v) {
+ store <4 x i32> %v, ptr addrspace(1) %p, !mem.cache_hint !4
+ ret void
+}
+
+; CHECK-LABEL: test_store_v2f64_l1_no_allocate
+; CHECK: st.global.L1::no_allocate.v2.b64
+define void @test_store_v2f64_l1_no_allocate(ptr addrspace(1) %p, <2 x double> %v) {
+ store <2 x double> %v, ptr addrspace(1) %p, !mem.cache_hint !3
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; No hint should produce plain load/store
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_load_no_hint
+; CHECK: ld.global.b32
+; CHECK-NOT: L1::
+; CHECK-NOT: L2::
+define i32 @test_load_no_hint(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_store_no_hint
+; CHECK: st.global.b32
+; CHECK-NOT: L1::
+; CHECK-NOT: L2::
+define void @test_store_no_hint(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; L2::cache_hint with constant cache-policy operand (metadata-based)
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_load_cache_hint_i32
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 12345
+; CHECK: ld.global.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+define i32 @test_load_cache_hint_i32(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !30
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_load_cache_hint_i64
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 98765
+; CHECK: ld.global.L2::cache_hint.b64 {{%rd[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+define i64 @test_load_cache_hint_i64(ptr addrspace(1) %p) {
+ %v = load i64, ptr addrspace(1) %p, !mem.cache_hint !31
+ ret i64 %v
+}
+
+; CHECK-LABEL: test_load_cache_hint_f32
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 55555
+; CHECK: ld.global.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+define float @test_load_cache_hint_f32(ptr addrspace(1) %p) {
+ %v = load float, ptr addrspace(1) %p, !mem.cache_hint !32
+ ret float %v
+}
+
+; CHECK-LABEL: test_store_cache_hint_i32
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 67890
+; CHECK: st.global.L2::cache_hint.b32 [{{%rd[0-9]+}}], {{%r[0-9]+}}, [[POLICY]]
+define void @test_store_cache_hint_i32(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !33
+ ret void
+}
+
+; CHECK-LABEL: test_store_cache_hint_i64
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 11111
+; CHECK: st.global.L2::cache_hint.b64 [{{%rd[0-9]+}}], {{%rd[0-9]+}}, [[POLICY]]
+define void @test_store_cache_hint_i64(ptr addrspace(1) %p, i64 %v) {
+ store i64 %v, ptr addrspace(1) %p, !mem.cache_hint !34
+ ret void
+}
+
+; CHECK-LABEL: test_store_cache_hint_f32
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 22222
+; CHECK: st.global.L2::cache_hint.b32 [{{%rd[0-9]+}}], {{%r[0-9]+}}, [[POLICY]]
+define void @test_store_cache_hint_f32(ptr addrspace(1) %p, float %v) {
+ store float %v, ptr addrspace(1) %p, !mem.cache_hint !35
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; L2::cache_hint with vector types
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_load_cache_hint_v2i32
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 33333
+; CHECK: ld.global.L2::cache_hint.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [{{%rd[0-9]+}}], [[POLICY]]
+define <2 x i32> @test_load_cache_hint_v2i32(ptr addrspace(1) %p) {
+ %v = load <2 x i32>, ptr addrspace(1) %p, !mem.cache_hint !40
+ ret <2 x i32> %v
+}
+
+; CHECK-LABEL: test_load_cache_hint_v4i32
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 44444
+; CHECK: ld.global.L2::cache_hint.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [{{%rd[0-9]+}}], [[POLICY]]
+define <4 x i32> @test_load_cache_hint_v4i32(ptr addrspace(1) %p) {
+ %v = load <4 x i32>, ptr addrspace(1) %p, !mem.cache_hint !41
+ ret <4 x i32> %v
+}
+
+; CHECK-LABEL: test_load_cache_hint_v2i64
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 55556
+; CHECK: ld.global.L2::cache_hint.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [{{%rd[0-9]+}}], [[POLICY]]
+define <2 x i64> @test_load_cache_hint_v2i64(ptr addrspace(1) %p) {
+ %v = load <2 x i64>, ptr addrspace(1) %p, !mem.cache_hint !42
+ ret <2 x i64> %v
+}
+
+; CHECK-LABEL: test_load_cache_hint_v2f32
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 66666
+; CHECK: ld.global.L2::cache_hint.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [{{%rd[0-9]+}}], [[POLICY]]
+define <2 x float> @test_load_cache_hint_v2f32(ptr addrspace(1) %p) {
+ %v = load <2 x float>, ptr addrspace(1) %p, !mem.cache_hint !43
+ ret <2 x float> %v
+}
+
+; CHECK-LABEL: test_load_cache_hint_v2f64
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 77777
+; CHECK: ld.global.L2::cache_hint.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [{{%rd[0-9]+}}], [[POLICY]]
+define <2 x double> @test_load_cache_hint_v2f64(ptr addrspace(1) %p) {
+ %v = load <2 x double>, ptr addrspace(1) %p, !mem.cache_hint !44
+ ret <2 x double> %v
+}
+
+; CHECK-LABEL: test_store_cache_hint_v2i32
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 88888
+; CHECK: st.global.L2::cache_hint.v2.b32 [{{%rd[0-9]+}}], {{{%r[0-9]+}}, {{%r[0-9]+}}}, [[POLICY]]
+define void @test_store_cache_hint_v2i32(ptr addrspace(1) %p, <2 x i32> %v) {
+ store <2 x i32> %v, ptr addrspace(1) %p, !mem.cache_hint !45
+ ret void
+}
+
+; CHECK-LABEL: test_store_cache_hint_v4i32
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 99999
+; CHECK: st.global.L2::cache_hint.v4.b32 [{{%rd[0-9]+}}], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [[POLICY]]
+define void @test_store_cache_hint_v4i32(ptr addrspace(1) %p, <4 x i32> %v) {
+ store <4 x i32> %v, ptr addrspace(1) %p, !mem.cache_hint !46
+ ret void
+}
+
+; CHECK-LABEL: test_store_cache_hint_v2i64
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 11112
+; CHECK: st.global.L2::cache_hint.v2.b64 [{{%rd[0-9]+}}], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[POLICY]]
+define void @test_store_cache_hint_v2i64(ptr addrspace(1) %p, <2 x i64> %v) {
+ store <2 x i64> %v, ptr addrspace(1) %p, !mem.cache_hint !47
+ ret void
+}
+
+; CHECK-LABEL: test_store_cache_hint_v2f32
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 22223
+; CHECK: st.global.L2::cache_hint.v2.b32 [{{%rd[0-9]+}}], {{{%r[0-9]+}}, {{%r[0-9]+}}}, [[POLICY]]
+define void @test_store_cache_hint_v2f32(ptr addrspace(1) %p, <2 x float> %v) {
+ store <2 x float> %v, ptr addrspace(1) %p, !mem.cache_hint !48
+ ret void
+}
+
+; CHECK-LABEL: test_store_cache_hint_v2f64
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 33334
+; CHECK: st.global.L2::cache_hint.v2.b64 [{{%rd[0-9]+}}], {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [[POLICY]]
+define void @test_store_cache_hint_v2f64(ptr addrspace(1) %p, <2 x double> %v) {
+ store <2 x double> %v, ptr addrspace(1) %p, !mem.cache_hint !49
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; L2::cache_hint combined with other hints (L2::cache_hint takes precedence)
+;-----------------------------------------------------------------------------
+
+; L2::cache_hint + L1 eviction: both qualifiers should be emitted
+; CHECK-LABEL: test_load_cache_hint_with_l1
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 44445
+; CHECK: ld.global.L1::evict_first.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+define i32 @test_load_cache_hint_with_l1(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !50
+ ret i32 %v
+}
+
+; L2::cache_hint + L2 eviction: both qualifiers should be emitted
+; CHECK-LABEL: test_load_cache_hint_with_l2_eviction
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 55557
+; CHECK: ld.global.L2::evict_last.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+define i32 @test_load_cache_hint_with_l2_eviction(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !51
+ ret i32 %v
+}
+
+; L2::cache_hint + L2 prefetch: both qualifiers should be emitted
+; CHECK-LABEL: test_load_cache_hint_with_prefetch
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 66667
+; CHECK: ld.global.L2::cache_hint.L2::128B.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+define i32 @test_load_cache_hint_with_prefetch(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !52
+ ret i32 %v
+}
+
+; L2::cache_hint + all other hints: all qualifiers emitted
+; CHECK-LABEL: test_load_cache_hint_with_all
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 77778
+; CHECK: ld.global.L1::evict_last.L2::evict_first.L2::cache_hint.L2::256B.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+define i32 @test_load_cache_hint_with_all(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !53
+ ret i32 %v
+}
+
+; Store: L2::cache_hint + L1 eviction
+; CHECK-LABEL: test_store_cache_hint_with_l1
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 88889
+; CHECK: st.global.L1::evict_unchanged.L2::cache_hint.b32 [{{%rd[0-9]+}}], {{%r[0-9]+}}, [[POLICY]]
+define void @test_store_cache_hint_with_l1(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !54
+ ret void
+}
+
+; Store: L2::cache_hint + L1 + L2 eviction (all qualifiers emitted)
+; CHECK-LABEL: test_store_cache_hint_with_all
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 99990
+; CHECK: st.global.L1::no_allocate.L2::evict_last.L2::cache_hint.b32 [{{%rd[0-9]+}}], {{%r[0-9]+}}, [[POLICY]]
+define void @test_store_cache_hint_with_all(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !55
+ ret void
+}
+
+; Vector load: L2::cache_hint + L1 eviction
+; CHECK-LABEL: test_load_cache_hint_v2i32_with_l1
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 11113
+; CHECK: ld.global.L1::evict_first.L2::cache_hint.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [{{%rd[0-9]+}}], [[POLICY]]
+define <2 x i32> @test_load_cache_hint_v2i32_with_l1(ptr addrspace(1) %p) {
+ %v = load <2 x i32>, ptr addrspace(1) %p, !mem.cache_hint !56
+ ret <2 x i32> %v
+}
+
+; Vector store: L2::cache_hint + L1 + L2 eviction + prefetch (all qualifiers emitted)
+; CHECK-LABEL: test_store_cache_hint_v2i32_with_all
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 22224
+; CHECK: st.global.L1::evict_last.L2::evict_first.L2::cache_hint.L2::64B.v2.b32 [{{%rd[0-9]+}}], {{{%r[0-9]+}}, {{%r[0-9]+}}}, [[POLICY]]
+define void @test_store_cache_hint_v2i32_with_all(ptr addrspace(1) %p, <2 x i32> %v) {
+ store <2 x i32> %v, ptr addrspace(1) %p, !mem.cache_hint !57
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Multiple loads sharing same pointer - each gets its own policy
+;-----------------------------------------------------------------------------
+
+; Two volatile loads from the same pointer - each load gets its own cache policy.
+; The per-MMO storage ensures no policy collisions when multiple memops share
+; the same pointer operand but have different cache policies.
+; CHECK-LABEL: test_multiple_loads_same_ptr
+; CHECK-DAG: mov.b64 [[POLICY1:%rd[0-9]+]], 11111
+; CHECK-DAG: mov.b64 [[POLICY2:%rd[0-9]+]], 22222
+; CHECK-DAG: ld.volatile.global.L1::evict_last.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY1]]
+; CHECK-DAG: ld.volatile.global.L1::evict_first.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY2]]
+define i32 @test_multiple_loads_same_ptr(ptr addrspace(1) %p) {
+ %v1 = load volatile i32, ptr addrspace(1) %p, !mem.cache_hint !60
+ %v2 = load volatile i32, ptr addrspace(1) %p, !mem.cache_hint !61
+ %sum = add i32 %v1, %v2
+ ret i32 %sum
+}
+
+;-----------------------------------------------------------------------------
+; Invalid/edge cases
+;-----------------------------------------------------------------------------
+
+; Test with invalid operand_no - should be ignored
+; CHECK-LABEL: test_load_invalid_operand_no
+; CHECK: ld.global.b32
+; CHECK-NOT: L1::
+define i32 @test_load_invalid_operand_no(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !11
+ ret i32 %v
+}
+
+; Test with unknown key - should be ignored, but valid L1 hint should still work
+; CHECK-LABEL: test_load_unknown_key
+; CHECK: ld.global.L1::evict_first.b32
+define i32 @test_load_unknown_key(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !12
+ ret i32 %v
+}
+
+; Test with reordered metadata (operand_no not first) - should still work
+; CHECK-LABEL: test_load_reordered_metadata
+; CHECK: ld.global.L1::evict_last.L2::evict_first.b32
+define i32 @test_load_reordered_metadata(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !70
+ ret i32 %v
+}
+
+; Test "normal" eviction - should not emit any qualifier (default behavior)
+; CHECK-LABEL: test_load_l1_normal
+; CHECK: ld.global.b32
+; CHECK-NOT: L1::evict_normal
+define i32 @test_load_l1_normal(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !13
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_load_l2_normal
+; CHECK: ld.global.b32
+; CHECK-NOT: L2::evict_normal
+define i32 @test_load_l2_normal(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !14
+ ret i32 %v
+}
+
+;-----------------------------------------------------------------------------
+; Metadata definitions
+;-----------------------------------------------------------------------------
+
+; L1 eviction policies
+!0 = !{!100}
+!100 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first"}
+
+!1 = !{!101}
+!101 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last"}
+
+!2 = !{!102}
+!102 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"unchanged"}
+
+!3 = !{!103}
+!103 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"no_allocate"}
+
+; L2 eviction policies
+!4 = !{!104}
+!104 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"first"}
+
+!5 = !{!105}
+!105 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"last"}
+
+; L2 prefetch sizes
+!6 = !{!106}
+!106 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"64B"}
+
+!7 = !{!107}
+!107 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"128B"}
+
+!8 = !{!108}
+!108 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"256B"}
+
+; Invalid operand_no (should be ignored for load which has operand 0)
+!11 = !{!111}
+!111 = !{!"operand_no", i32 5, !"nvvm.l1_eviction", !"first"}
+
+; Unknown key (should be ignored, but valid L1 hint should still work)
+!12 = !{!112}
+!112 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first", !"nvvm.unknown_key", !"value"}
+
+; "normal" eviction (default, should not emit qualifier)
+!13 = !{!113}
+!113 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"normal"}
+
+!14 = !{!114}
+!114 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"normal"}
+
+; All L1 + L2 combinations
+!20 = !{!120}
+!120 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first"}
+
+!21 = !{!121}
+!121 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last"}
+
+!22 = !{!122}
+!122 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first"}
+
+!23 = !{!123}
+!123 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"last"}
+
+; L1 + L2 + Prefetch combination
+!24 = !{!124}
+!124 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"128B"}
+
+; L2::cache_hint with constant cache-policy
+!30 = !{!130}
+!130 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 12345}
+
+!31 = !{!131}
+!131 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 98765}
+
+!32 = !{!132}
+!132 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 55555}
+
+!33 = !{!133}
+!133 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 67890}
+
+!34 = !{!134}
+!134 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 11111}
+
+!35 = !{!135}
+!135 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22222}
+
+; L2::cache_hint for vector types
+!40 = !{!140}
+!140 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 33333}
+
+!41 = !{!141}
+!141 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 44444}
+
+!42 = !{!142}
+!142 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 55556}
+
+!43 = !{!143}
+!143 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 66666}
+
+!44 = !{!144}
+!144 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 77777}
+
+!45 = !{!145}
+!145 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 88888}
+
+!46 = !{!146}
+!146 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 99999}
+
+!47 = !{!147}
+!147 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 11112}
+
+!48 = !{!148}
+!148 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22223}
+
+!49 = !{!149}
+!149 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 33334}
+
+; L2::cache_hint combined with other hints (L2::cache_hint takes precedence)
+!50 = !{!150}
+!150 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 44445, !"nvvm.l1_eviction", !"first"}
+
+!51 = !{!151}
+!151 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 55557, !"nvvm.l2_eviction", !"last"}
+
+!52 = !{!152}
+!152 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 66667, !"nvvm.l2_prefetch_size", !"128B"}
+
+!53 = !{!153}
+!153 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 77778, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"256B"}
+
+!54 = !{!154}
+!154 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 88889, !"nvvm.l1_eviction", !"unchanged"}
+
+!55 = !{!155}
+!155 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 99990, !"nvvm.l1_eviction", !"no_allocate", !"nvvm.l2_eviction", !"last"}
+
+!56 = !{!156}
+!156 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 11113, !"nvvm.l1_eviction", !"first"}
+
+!57 = !{!157}
+!157 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22224, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"64B"}
+
+; Multiple loads same pointer test (different policies)
+!60 = !{!160}
+!160 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 11111, !"nvvm.l1_eviction", !"last"}
+
+!61 = !{!161}
+!161 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22222, !"nvvm.l1_eviction", !"first"}
+
+; Reordered metadata test (operand_no not first)
+!70 = !{!170}
+!170 = !{!"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first", !"operand_no", i32 0}
diff --git a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
index 4be91dfc60c6a..299b0392a04df 100644
--- a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
+++ b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir
@@ -26,10 +26,10 @@ body: |
; CHECK: bb.0.entry:
; CHECK-NEXT: successors: %bb.2(0x30000000), %bb.3(0x50000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[LD_i32_:%[0-9]+]]:b32 = LD_i32 0, 0, 101, 3, 32, -1, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
- ; CHECK-NEXT: [[LD_i64_:%[0-9]+]]:b64 = LD_i64 0, 0, 101, 3, 64, -1, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
+ ; CHECK-NEXT: [[LD_i32_:%[0-9]+]]:b32 = LD_i32 0, 0, 101, 3, 32, -1, 0, 0, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
+ ; CHECK-NEXT: [[LD_i64_:%[0-9]+]]:b64 = LD_i64 0, 0, 101, 3, 64, -1, 0, 0, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
; CHECK-NEXT: [[ADD64ri:%[0-9]+]]:b64 = nuw ADD64ri killed [[LD_i64_]], 2
- ; CHECK-NEXT: [[LD_i32_1:%[0-9]+]]:b32 = LD_i32 0, 0, 1, 3, 32, -1, [[ADD64ri]], 0
+ ; CHECK-NEXT: [[LD_i32_1:%[0-9]+]]:b32 = LD_i32 0, 0, 1, 3, 32, -1, 0, 0, [[ADD64ri]], 0
; CHECK-NEXT: [[SETP_i32ri:%[0-9]+]]:b1 = SETP_i32ri [[LD_i32_]], 0, 0
; CHECK-NEXT: CBranch killed [[SETP_i32ri]], %bb.2
; CHECK-NEXT: {{ $}}
@@ -49,15 +49,15 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: [[PHI1:%[0-9]+]]:b32 = PHI [[LD_i32_1]], %bb.0, [[SREM32rr]], %bb.1
- ; CHECK-NEXT: ST_i32 [[PHI1]], 0, 0, 1, 32, [[ADD64ri]], 0
+ ; CHECK-NEXT: ST_i32 [[PHI1]], 0, 0, 1, 32, 0, [[ADD64ri]], 0, 0
; CHECK-NEXT: Return
bb.0.entry:
successors: %bb.2(0x30000000), %bb.1(0x50000000)
- %5:b32 = LD_i32 0, 0, 101, 3, 32, -1, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
- %6:b64 = LD_i64 0, 0, 101, 3, 64, -1, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
+ %5:b32 = LD_i32 0, 0, 101, 3, 32, -1, 0, 0, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101)
+ %6:b64 = LD_i64 0, 0, 101, 3, 64, -1, 0, 0, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101)
%0:b64 = nuw ADD64ri killed %6, 2
- %1:b32 = LD_i32 0, 0, 1, 3, 32, -1, %0, 0
+ %1:b32 = LD_i32 0, 0, 1, 3, 32, -1, 0, 0, %0, 0
%7:b1 = SETP_i32ri %5, 0, 0
CBranch killed %7, %bb.2
GOTO %bb.1
@@ -75,6 +75,6 @@ body: |
bb.2:
%4:b32 = PHI %1, %bb.0, %3, %bb.1
- ST_i32 %4, 0, 0, 1, 32, %0, 0
+ ST_i32 %4, 0, 0, 1, 32, 0, %0, 0, 0
Return
...
diff --git a/llvm/test/CodeGen/NVPTX/memcpy-cache-hint.ll b/llvm/test/CodeGen/NVPTX/memcpy-cache-hint.ll
new file mode 100644
index 0000000000000..64958bc8a1c88
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/memcpy-cache-hint.ll
@@ -0,0 +1,406 @@
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74 | %ptxas-verify %}
+
+; Test !mem.cache_hint metadata on llvm.memcpy intrinsic
+; For memcpy:
+; operand_no = 0 applies to destination (store side)
+; operand_no = 1 applies to source (load side)
+
+declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1), ptr addrspace(1), i64, i1)
+
+;-----------------------------------------------------------------------------
+; Test memcpy with cache hints on both source and destination
+; Source (operand 1): L1::evict_first, L2::evict_first, L2::128B
+; Dest (operand 0): L1::evict_last, L2::evict_last
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_memcpy_both_hints
+; CHECK-DAG: ld.global.L1::evict_first.L2::evict_first.L2::128B.b
+; CHECK-DAG: st.global.L1::evict_last.L2::evict_last.b
+define void @test_memcpy_both_hints(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !80
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Test memcpy with cache hint only on source (load side)
+; Source (operand 1): L1::evict_first
+; Dest (operand 0): no hint
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_memcpy_src_hint_only
+; CHECK: ld.global.L1::evict_first.b
+; CHECK: st.global.b
+; CHECK-NOT: st.global.L1
+define void @test_memcpy_src_hint_only(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !81
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Test memcpy with cache hint only on destination (store side)
+; Source (operand 1): no hint
+; Dest (operand 0): L2::evict_last
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_memcpy_dest_hint_only
+; CHECK: ld.global.b
+; CHECK-NOT: ld.global.L2
+; CHECK: st.global.L2::evict_last.b
+define void @test_memcpy_dest_hint_only(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !82
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Test memcpy with L2::cache_hint on both operands
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_memcpy_l2_cache_hint
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 34343
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 12121
+; CHECK-DAG: ld.global.L2::cache_hint.b
+; CHECK-DAG: st.global.L2::cache_hint.b
+define void @test_memcpy_l2_cache_hint(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !83
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Test memcpy without cache hints produces plain load/store
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_memcpy_no_hint
+; CHECK: ld.global.b
+; CHECK: st.global.b
+; CHECK-NOT: L1::evict
+; CHECK-NOT: L2::evict
+; CHECK-NOT: L2::cache_hint
+define void @test_memcpy_no_hint(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false)
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Combined L1 + L2 eviction policies
+;-----------------------------------------------------------------------------
+
+; Source: L1::evict_first + L2::evict_last
+; Dest: no hint
+; CHECK-LABEL: test_memcpy_src_l1_l2_combined
+; CHECK: ld.global.L1::evict_first.L2::evict_last.b
+; CHECK: st.global.b
+; CHECK-NOT: st.global.L1
+; CHECK-NOT: st.global.L2
+define void @test_memcpy_src_l1_l2_combined(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !84
+ ret void
+}
+
+; Source: no hint
+; Dest: L1::evict_unchanged + L2::evict_first
+; CHECK-LABEL: test_memcpy_dest_l1_l2_combined
+; CHECK: ld.global.b
+; CHECK-NOT: ld.global.L1
+; CHECK: st.global.L1::evict_unchanged.L2::evict_first.b
+define void @test_memcpy_dest_l1_l2_combined(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !85
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; L1 + prefetch combinations
+;-----------------------------------------------------------------------------
+
+; Source: L1::evict_last + L2::256B prefetch
+; Dest: L1::no_allocate
+; CHECK-LABEL: test_memcpy_l1_prefetch
+; CHECK-DAG: ld.global.L1::evict_last.L2::256B.b
+; CHECK-DAG: st.global.L1::no_allocate.b
+define void @test_memcpy_l1_prefetch(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !86
+ ret void
+}
+
+; Source: L2::64B prefetch only
+; Dest: L2::evict_last only
+; CHECK-LABEL: test_memcpy_prefetch_vs_eviction
+; CHECK: ld.global.L2::64B.b
+; CHECK: st.global.L2::evict_last.b
+define void @test_memcpy_prefetch_vs_eviction(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !87
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; L2::cache_hint combined with other hints
+;-----------------------------------------------------------------------------
+
+; Source: L2::cache_hint + L1::evict_first
+; Dest: no hint
+; CHECK-LABEL: test_memcpy_src_cache_hint_l1
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 55555
+; CHECK: ld.global.L1::evict_first.L2::cache_hint.b
+; CHECK: st.global.b
+; CHECK-NOT: st.global.L2::cache_hint
+define void @test_memcpy_src_cache_hint_l1(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !88
+ ret void
+}
+
+; Source: no hint
+; Dest: L2::cache_hint + L1::evict_last + L2::evict_first
+; CHECK-LABEL: test_memcpy_dest_cache_hint_combined
+; CHECK: ld.global.b
+; CHECK-NOT: ld.global.L2::cache_hint
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 66666
+; CHECK: st.global.L1::evict_last.L2::evict_first.L2::cache_hint.b
+define void @test_memcpy_dest_cache_hint_combined(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !89
+ ret void
+}
+
+; Both operands: L2::cache_hint + L1 eviction + L2 eviction
+; Source: L2::cache_hint(77777) + L1::evict_unchanged + L2::evict_last
+; Dest: L2::cache_hint(88888) + L1::evict_first + L2::evict_first
+; CHECK-LABEL: test_memcpy_both_cache_hint_combined
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 77777
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 88888
+; CHECK-DAG: ld.global.L1::evict_unchanged.L2::evict_last.L2::cache_hint.b
+; CHECK-DAG: st.global.L1::evict_first.L2::evict_first.L2::cache_hint.b
+define void @test_memcpy_both_cache_hint_combined(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !90
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; L2::cache_hint + prefetch combinations
+;-----------------------------------------------------------------------------
+
+; Source: L2::cache_hint + L2::128B prefetch
+; Dest: L2::cache_hint + L1::evict_last
+; CHECK-LABEL: test_memcpy_cache_hint_prefetch
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 11111
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 22222
+; CHECK-DAG: ld.global.L2::cache_hint.L2::128B.b
+; CHECK-DAG: st.global.L1::evict_last.L2::cache_hint.b
+define void @test_memcpy_cache_hint_prefetch(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !91
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Asymmetric hint combinations (complex vs simple)
+;-----------------------------------------------------------------------------
+
+; Source: all hints (L1 + L2 eviction + prefetch)
+; Dest: simple L1 hint only
+; CHECK-LABEL: test_memcpy_complex_src_simple_dest
+; CHECK-DAG: ld.global.L1::evict_first.L2::evict_last.L2::64B.b
+; CHECK-DAG: st.global.L1::evict_last.b
+define void @test_memcpy_complex_src_simple_dest(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !92
+ ret void
+}
+
+; Source: simple L2 prefetch only
+; Dest: all hints (L1 + L2 eviction + L2::cache_hint)
+; CHECK-LABEL: test_memcpy_simple_src_complex_dest
+; CHECK: ld.global.L2::256B.b
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 99999
+; CHECK: st.global.L1::no_allocate.L2::evict_last.L2::cache_hint.b
+define void @test_memcpy_simple_src_complex_dest(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !93
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Different L1 eviction policies on src vs dest
+;-----------------------------------------------------------------------------
+
+; Source: L1::evict_unchanged
+; Dest: L1::evict_first
+; CHECK-LABEL: test_memcpy_different_l1_policies
+; CHECK-DAG: ld.global.L1::evict_unchanged.b
+; CHECK-DAG: st.global.L1::evict_first.b
+define void @test_memcpy_different_l1_policies(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !94
+ ret void
+}
+
+; Source: L1::no_allocate
+; Dest: L1::evict_unchanged
+; CHECK-LABEL: test_memcpy_no_allocate_vs_unchanged
+; CHECK-DAG: ld.global.L1::no_allocate.b
+; CHECK-DAG: st.global.L1::evict_unchanged.b
+define void @test_memcpy_no_allocate_vs_unchanged(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !95
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; All hints maxed out on both operands
+;-----------------------------------------------------------------------------
+
+; Source: L1::evict_first + L2::evict_first + L2::256B + L2::cache_hint
+; Dest: L1::evict_last + L2::evict_last + L2::128B + L2::cache_hint
+; CHECK-LABEL: test_memcpy_all_hints_both
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 12345
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 67890
+; CHECK-DAG: ld.global.L1::evict_first.L2::evict_first.L2::cache_hint.L2::256B.b
+; CHECK-DAG: st.global.L1::evict_last.L2::evict_last.L2::cache_hint.L2::128B.b
+define void @test_memcpy_all_hints_both(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 4, i1 false), !mem.cache_hint !96
+ ret void
+}
+
+;-----------------------------------------------------------------------------
+; Metadata definitions
+;-----------------------------------------------------------------------------
+
+; memcpy with both dest and src hints
+!80 = !{!180, !181}
+; operand 0 (dest/store): L1::evict_last, L2::evict_last
+!180 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"last"}
+; operand 1 (src/load): L1::evict_first, L2::evict_first, L2::128B prefetch
+!181 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"128B"}
+
+; memcpy with only source hint (load side)
+!81 = !{!182}
+!182 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first"}
+
+; memcpy with only dest hint (store side)
+!82 = !{!183}
+!183 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"last"}
+
+; memcpy with L2::cache_hint on both operands
+!83 = !{!184, !185}
+!184 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 12121}
+!185 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 34343}
+
+; Combined L1 + L2 eviction on source only
+!84 = !{!186}
+!186 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last"}
+
+; Combined L1 + L2 eviction on dest only
+!85 = !{!187}
+!187 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"unchanged", !"nvvm.l2_eviction", !"first"}
+
+; L1 + prefetch on source, L1 on dest
+!86 = !{!188, !189}
+!188 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"last", !"nvvm.l2_prefetch_size", !"256B"}
+!189 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"no_allocate"}
+
+; Prefetch on source, L2 eviction on dest
+!87 = !{!190, !191}
+!190 = !{!"operand_no", i32 1, !"nvvm.l2_prefetch_size", !"64B"}
+!191 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"last"}
+
+; L2::cache_hint + L1 eviction on source only
+!88 = !{!192}
+!192 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 55555, !"nvvm.l1_eviction", !"first"}
+
+; L2::cache_hint + L1 + L2 eviction on dest only
+!89 = !{!193}
+!193 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 66666, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first"}
+
+; Both operands: L2::cache_hint + L1 + L2 eviction
+!90 = !{!194, !195}
+!194 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 77777, !"nvvm.l1_eviction", !"unchanged", !"nvvm.l2_eviction", !"last"}
+!195 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 88888, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first"}
+
+; L2::cache_hint + prefetch on source, L2::cache_hint + L1 on dest
+!91 = !{!196, !197}
+!196 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 11111, !"nvvm.l2_prefetch_size", !"128B"}
+!197 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22222, !"nvvm.l1_eviction", !"last"}
+
+; Complex source (all non-cache_hint), simple dest
+!92 = !{!198, !199}
+!198 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"64B"}
+!199 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last"}
+
+; Simple source, complex dest (with cache_hint)
+!93 = !{!200, !201}
+!200 = !{!"operand_no", i32 1, !"nvvm.l2_prefetch_size", !"256B"}
+!201 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 99999, !"nvvm.l1_eviction", !"no_allocate", !"nvvm.l2_eviction", !"last"}
+
+; Different L1 policies: unchanged vs first
+!94 = !{!202, !203}
+!202 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"unchanged"}
+!203 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first"}
+
+; Different L1 policies: no_allocate vs unchanged
+!95 = !{!204, !205}
+!204 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"no_allocate"}
+!205 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"unchanged"}
+
+; All hints maxed out on both operands
+!96 = !{!206, !207}
+!206 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 12345, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"256B"}
+!207 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 67890, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"128B"}
+
+;-----------------------------------------------------------------------------
+; Large memcpy tests - verify hints propagate to all expanded load/stores
+; LLVM expands memcpy to multiple load/store pairs. Each pair should
+; get the appropriate cache hints from the original memcpy metadata.
+; The expansion may use various sizes (b8, b16, b32, v2, v4, etc.)
+;-----------------------------------------------------------------------------
+
+; 16-byte memcpy: verify hints are applied to expanded loads/stores
+; CHECK-LABEL: test_memcpy_16bytes
+; CHECK: ld.global.L1::evict_first.b
+; CHECK: st.global.L1::evict_last.b
+define void @test_memcpy_16bytes(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 16, i1 false), !mem.cache_hint !97
+ ret void
+}
+
+; 32-byte memcpy: all loads should have L1::evict_unchanged, all stores L2::evict_first
+; CHECK-LABEL: test_memcpy_32bytes
+; CHECK: ld.global.L1::evict_unchanged.b
+; CHECK: st.global.L2::evict_first.b
+define void @test_memcpy_32bytes(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 32, i1 false), !mem.cache_hint !98
+ ret void
+}
+
+; 64-byte memcpy with L2::cache_hint
+; All loads and stores should get the L2::cache_hint with their respective policies
+; CHECK-LABEL: test_memcpy_64bytes_cache_hint
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 11111
+; CHECK-DAG: mov.b64 {{%rd[0-9]+}}, 22222
+; CHECK: ld.global.L2::cache_hint.b
+; CHECK: st.global.L2::cache_hint.b
+define void @test_memcpy_64bytes_cache_hint(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 64, i1 false), !mem.cache_hint !99
+ ret void
+}
+
+; 128-byte memcpy with combined hints
+; Note: Large memcpy (>64 bytes) is expanded to a loop in the backend.
+; Cache hints are not preserved for loop-based expansion.
+; This test verifies the code compiles correctly.
+; CHECK-LABEL: test_memcpy_128bytes_combined
+; CHECK: ld.global.b
+; CHECK: st.global.b
+define void @test_memcpy_128bytes_combined(ptr addrspace(1) %dest, ptr addrspace(1) %src) {
+ call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 128, i1 false), !mem.cache_hint !100
+ ret void
+}
+
+; Large memcpy metadata
+!97 = !{!208, !209}
+!208 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first"}
+!209 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last"}
+
+!98 = !{!210, !211}
+!210 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"unchanged"}
+!211 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"first"}
+
+!99 = !{!212, !213}
+!212 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 11111}
+!213 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22222}
+
+!100 = !{!214, !215}
+!214 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"256B"}
+!215 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first"}
diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
index a84b7fcd33836..8e1c7975335c9 100644
--- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
+++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure.mir
@@ -77,22 +77,22 @@ constants: []
machineFunctionInfo: {}
body: |
bb.0:
- %0:b32, %1:b32, %2:b32, %3:b32 = LDV_i32_v4 0, 0, 101, 3, 32, -1, &retval0, 0 :: (load (s128), addrspace 101)
+ %0:b32, %1:b32, %2:b32, %3:b32 = LDV_i32_v4 0, 0, 101, 3, 32, -1, 0, 0, &retval0, 0 :: (load (s128), addrspace 101)
; CHECK-NOT: ProxyReg
%4:b32 = ProxyRegB32 killed %0
%5:b32 = ProxyRegB32 killed %1
%6:b32 = ProxyRegB32 killed %2
%7:b32 = ProxyRegB32 killed %3
; CHECK: STV_i32_v4 killed %0, killed %1, killed %2, killed %3
- STV_i32_v4 killed %4, killed %5, killed %6, killed %7, 0, 0, 101, 32, &func_retval0, 0 :: (store (s128), addrspace 101)
+ STV_i32_v4 killed %4, killed %5, killed %6, killed %7, 0, 0, 101, 32, 0, &func_retval0, 0, 0 :: (store (s128), addrspace 101)
- %8:b32 = LD_i32 0, 0, 101, 3, 32, -1, &retval0, 0 :: (load (s32), addrspace 101)
+ %8:b32 = LD_i32 0, 0, 101, 3, 32, -1, 0, 0, &retval0, 0 :: (load (s32), addrspace 101)
; CHECK-NOT: ProxyReg
%9:b32 = ProxyRegB32 killed %8
%10:b32 = ProxyRegB32 killed %9
%11:b32 = ProxyRegB32 killed %10
; CHECK: ST_i32 killed %8
- ST_i32 killed %11, 0, 0, 101, 32, &func_retval0, 0 :: (store (s32), addrspace 101)
+ ST_i32 killed %11, 0, 0, 101, 32, 0, &func_retval0, 0, 0 :: (store (s32), addrspace 101)
Return
...
>From 476e1456f394bc965ec07db0866c7c46ebae5487 Mon Sep 17 00:00:00 2001
From: Fei Peng <feip at nvidia.com>
Date: Wed, 14 Jan 2026 11:51:02 -0800
Subject: [PATCH 02/10] [SelectionDAG] Fix missing pointer info inference in
getMemcpyLoadsAndStores
When creating MachineMemOperands manually for memcpy lowering, we need
to call InferPointerInfo to recover pointer information from DAG nodes
when the MachinePointerInfo doesn't have a valid Value. This matches
the behavior of the original code path that went through getLoad/getStore
with individual parameters.
Without this fix, byval argument passing on PowerPC (and potentially
other targets) would lose stack slot pointer info, causing test failures.
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 60 ++++++++++++-------
1 file changed, 39 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index a7e033284286f..214ebeac53875 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8767,6 +8767,12 @@ static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
}
}
+// Forward declaration - defined later in this file. Default argument is on
+// the definition.
+static MachinePointerInfo InferPointerInfo(const MachinePointerInfo &Info,
+ SelectionDAG &DAG, SDValue Ptr,
+ int64_t Offset);
+
static SDValue
getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
SDValue Dst, SDValue Src, uint64_t Size,
@@ -8881,20 +8887,24 @@ getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
}
Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
if (Value.getNode()) {
+ SDValue DstPtrOff =
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff));
+
+ // Infer pointer info from DAG if not provided (e.g., for stack slots).
+ MachinePointerInfo DstPI = DstPtrInfo.getWithOffset(DstOff);
+ if (DstPI.V.isNull())
+ DstPI = InferPointerInfo(DstPI, DAG, DstPtrOff, 0);
+
// Create store MMO explicitly to allow targets to record cache hints.
- MachineMemOperand *StoreMMO =
- MF.getMachineMemOperand(DstPtrInfo.getWithOffset(DstOff),
- MMOFlags | MachineMemOperand::MOStore,
- VTSize, Alignment, NewAAInfo);
+ MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
+ DstPI, MMOFlags | MachineMemOperand::MOStore, VTSize, Alignment,
+ NewAAInfo);
// Call hook for target-specific cache hint recording (operand 0 =
// dest).
if (CI)
TLI.recordTargetMMOInfo(StoreMMO, *CI, /*OperandNo=*/0);
- Store = DAG.getStore(
- Chain, dl, Value,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
- StoreMMO);
+ Store = DAG.getStore(Chain, dl, Value, DstPtrOff, StoreMMO);
OutChains.push_back(Store);
}
}
@@ -8916,33 +8926,41 @@ getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
if (isConstant)
SrcMMOFlags |= MachineMemOperand::MOInvariant;
+ // Compute pointer offsets for this iteration.
+ SDValue SrcPtrOff =
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff));
+ SDValue DstPtrOff =
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff));
+
+ // Infer pointer info from DAG if not provided (e.g., for stack slots).
+ MachinePointerInfo SrcPI = SrcPtrInfo.getWithOffset(SrcOff);
+ if (SrcPI.V.isNull())
+ SrcPI = InferPointerInfo(SrcPI, DAG, SrcPtrOff, 0);
+ MachinePointerInfo DstPI = DstPtrInfo.getWithOffset(DstOff);
+ if (DstPI.V.isNull())
+ DstPI = InferPointerInfo(DstPI, DAG, DstPtrOff, 0);
+
// Create load MMO explicitly to allow targets to record cache hints.
MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
- SrcPtrInfo.getWithOffset(SrcOff),
- SrcMMOFlags | MachineMemOperand::MOLoad, VTSize,
+ SrcPI, SrcMMOFlags | MachineMemOperand::MOLoad, VTSize,
commonAlignment(*SrcAlign, SrcOff), NewAAInfo);
// Call hook for target-specific cache hint recording (operand 1 = src).
if (CI)
TLI.recordTargetMMOInfo(LoadMMO, *CI, /*OperandNo=*/1);
- Value = DAG.getExtLoad(
- ISD::EXTLOAD, dl, NVT, Chain,
- DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)), VT,
- LoadMMO);
+ Value =
+ DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain, SrcPtrOff, VT, LoadMMO);
OutLoadChains.push_back(Value.getValue(1));
// Create store MMO explicitly to allow targets to record cache hints.
- MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
- DstPtrInfo.getWithOffset(DstOff),
- MMOFlags | MachineMemOperand::MOStore, VTSize, Alignment, NewAAInfo);
+ MachineMemOperand *StoreMMO =
+ MF.getMachineMemOperand(DstPI, MMOFlags | MachineMemOperand::MOStore,
+ VTSize, Alignment, NewAAInfo);
// Call hook for target-specific cache hint recording (operand 0 = dest).
if (CI)
TLI.recordTargetMMOInfo(StoreMMO, *CI, /*OperandNo=*/0);
- Store = DAG.getTruncStore(
- Chain, dl, Value,
- DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)), VT,
- StoreMMO);
+ Store = DAG.getTruncStore(Chain, dl, Value, DstPtrOff, VT, StoreMMO);
OutStoreChains.push_back(Store);
}
SrcOff += VTSize;
>From 34bb834a8a1f199b006c41e4b20f6409f68c5ffa Mon Sep 17 00:00:00 2001
From: Fei Peng <feip at nvidia.com>
Date: Thu, 15 Jan 2026 11:17:32 -0800
Subject: [PATCH 03/10] [NVPTX] Address review feedback for cache hint support
- Move CachePolicyData from NVPTXTargetMachine to NVPTXMachineFunctionInfo
for proper per-function scoping
- Use ADT/Bitfields.h for type-safe cache hint encoding instead of manual
bit manipulation
- Use TableGen's getNamedOperandIdx() instead of manual LDOp/STOp enums
by adding UseNamedOperandTable to LD/ST instruction classes
- Use range-based for loops in NVPTXUtilities.cpp
- Use mdconst::dyn_extract for metadata constant extraction
---
llvm/include/llvm/CodeGen/TargetLowering.h | 5 +-
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 6 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 4 +-
llvm/lib/Target/NVPTX/NVPTX.h | 95 ++++++-------------
llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp | 13 ++-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 43 ++++-----
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 11 ++-
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 5 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp | 4 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 66 +++++++------
.../Target/NVPTX/NVPTXMachineFunctionInfo.h | 29 ++++++
.../Target/NVPTX/NVPTXReplaceImageHandles.cpp | 6 +-
llvm/lib/Target/NVPTX/NVPTXTargetMachine.h | 25 -----
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 50 ++++------
14 files changed, 162 insertions(+), 200 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8cfd1df7eb5a6..4e417da246ddd 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -481,12 +481,13 @@ class LLVM_ABI TargetLoweringBase {
/// This is called by SelectionDAGBuilder for load/store instructions
/// and during memcpy/memmove lowering.
/// Targets can use this to store target-specific cache policies or
- /// other per-memop metadata in a side table.
+ /// other per-memop metadata in MachineFunctionInfo.
/// The OperandNo parameter specifies which memory operand of the instruction
/// this MMO corresponds to (used for multi-operand instructions like memcpy
/// where operand 0 is dest and operand 1 is src).
/// The default implementation does nothing.
- virtual void recordTargetMMOInfo(MachineMemOperand *MMO, const Instruction &I,
+ virtual void recordTargetMMOInfo(MachineFunction &MF, MachineMemOperand *MMO,
+ const Instruction &I,
unsigned OperandNo = 0) const {}
virtual bool isSelectSupported(SelectSupportKind /*kind*/) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 214ebeac53875..c5ecbf2b352b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8902,7 +8902,7 @@ getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
// Call hook for target-specific cache hint recording (operand 0 =
// dest).
if (CI)
- TLI.recordTargetMMOInfo(StoreMMO, *CI, /*OperandNo=*/0);
+ TLI.recordTargetMMOInfo(MF, StoreMMO, *CI, /*OperandNo=*/0);
Store = DAG.getStore(Chain, dl, Value, DstPtrOff, StoreMMO);
OutChains.push_back(Store);
@@ -8946,7 +8946,7 @@ getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
commonAlignment(*SrcAlign, SrcOff), NewAAInfo);
// Call hook for target-specific cache hint recording (operand 1 = src).
if (CI)
- TLI.recordTargetMMOInfo(LoadMMO, *CI, /*OperandNo=*/1);
+ TLI.recordTargetMMOInfo(MF, LoadMMO, *CI, /*OperandNo=*/1);
Value =
DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain, SrcPtrOff, VT, LoadMMO);
@@ -8958,7 +8958,7 @@ getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
VTSize, Alignment, NewAAInfo);
// Call hook for target-specific cache hint recording (operand 0 = dest).
if (CI)
- TLI.recordTargetMMOInfo(StoreMMO, *CI, /*OperandNo=*/0);
+ TLI.recordTargetMMOInfo(MF, StoreMMO, *CI, /*OperandNo=*/0);
Store = DAG.getTruncStore(Chain, dl, Value, DstPtrOff, VT, StoreMMO);
OutStoreChains.push_back(Store);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b3e653c5a8483..957f1655f587a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4773,7 +4773,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, MMOFlags | MachineMemOperand::MOLoad,
Size, Alignment, AAInfo, Ranges);
- TLI.recordTargetMMOInfo(MMO, I);
+ TLI.recordTargetMMOInfo(MF, MMO, I);
SDValue L = DAG.getLoad(MemVTs[i], dl, Root, A, MMO);
Chains[ChainI] = L.getValue(1);
@@ -4924,7 +4924,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, MMOFlags | MachineMemOperand::MOStore,
Size, Alignment, AAInfo);
- TLI.recordTargetMMOInfo(MMO, I);
+ TLI.recordTargetMMOInfo(MF, MMO, I);
SDValue St = DAG.getStore(Root, dl, Val, Add, MMO);
Chains[ChainI] = St;
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index a1d4e70d1bc26..18d3d9328de67 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -16,14 +16,12 @@
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Bitfields.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"
-#include <optional>
namespace llvm {
class Function;
@@ -207,7 +205,7 @@ enum AddressSpace : AddressSpaceUnderlyingType {
// These correspond to PTX cache control qualifiers
// L1 Eviction Policy - maps to PTX L1::evict_* qualifiers
-enum class L1Eviction : unsigned {
+enum class L1Eviction : uint8_t {
Normal = 0, // Default behavior (no qualifier)
Unchanged = 1, // L1::evict_unchanged
First = 2, // L1::evict_first
@@ -216,26 +214,35 @@ enum class L1Eviction : unsigned {
};
// L2 Eviction Policy - maps to PTX L2::evict_* qualifiers
-enum class L2Eviction : unsigned {
+enum class L2Eviction : uint8_t {
Normal = 0, // Default behavior (no qualifier)
First = 1, // L2::evict_first
Last = 2, // L2::evict_last
};
// L2 Prefetch Size - maps to PTX L2::*B qualifiers
-enum class L2Prefetch : unsigned {
+enum class L2Prefetch : uint8_t {
None = 0, // No prefetch hint
Bytes64 = 1, // L2::64B
Bytes128 = 2, // L2::128B
Bytes256 = 3, // L2::256B
};
-// Bitfield layout for encoded cache hints:
+// Bitfield layout for encoded cache hints (stored in unsigned):
// Bits 0-2: L1 Eviction (3 bits, 5 values)
// Bits 3-4: L2 Eviction (2 bits, 3 values)
// Bits 5-6: L2 Prefetch (2 bits, 4 values)
// Bit 7: L2::cache_hint mode flag (when set, use CachePolicy operand)
// Bits 8-31: Reserved
+//
+// Using llvm::Bitfield for type-safe access with compile-time validation.
+using L1EvictionBits =
+ Bitfield::Element<L1Eviction, 0, 3, L1Eviction::NoAllocate>;
+using L2EvictionBits = Bitfield::Element<L2Eviction, 3, 2, L2Eviction::Last>;
+using L2PrefetchBits = Bitfield::Element<L2Prefetch, 5, 2, L2Prefetch::Bytes256>;
+using L2CacheHintBit = Bitfield::Element<bool, 7, 1>;
+
+// Masks for clearing/testing fields (for legacy code and instruction emission)
constexpr unsigned L1EvictionShift = 0;
constexpr unsigned L1EvictionMask = 0x7;
constexpr unsigned L2EvictionShift = 3;
@@ -245,80 +252,29 @@ constexpr unsigned L2PrefetchMask = 0x3;
constexpr unsigned L2CacheHintFlag = 0x80; // Bit 7: L2::cache_hint mode
inline unsigned encodeCacheHint(L1Eviction L1, L2Eviction L2, L2Prefetch P) {
- return (static_cast<unsigned>(L1) << L1EvictionShift) |
- (static_cast<unsigned>(L2) << L2EvictionShift) |
- (static_cast<unsigned>(P) << L2PrefetchShift);
+ unsigned Hint = 0;
+ Bitfield::set<L1EvictionBits>(Hint, L1);
+ Bitfield::set<L2EvictionBits>(Hint, L2);
+ Bitfield::set<L2PrefetchBits>(Hint, P);
+ return Hint;
}
inline L1Eviction decodeL1Eviction(unsigned Hint) {
- return static_cast<L1Eviction>((Hint >> L1EvictionShift) & L1EvictionMask);
+ return Bitfield::get<L1EvictionBits>(Hint);
}
inline L2Eviction decodeL2Eviction(unsigned Hint) {
- return static_cast<L2Eviction>((Hint >> L2EvictionShift) & L2EvictionMask);
+ return Bitfield::get<L2EvictionBits>(Hint);
}
inline L2Prefetch decodeL2Prefetch(unsigned Hint) {
- return static_cast<L2Prefetch>((Hint >> L2PrefetchShift) & L2PrefetchMask);
+ return Bitfield::get<L2PrefetchBits>(Hint);
}
inline bool isL2CacheHintMode(unsigned Hint) {
- return (Hint & L2CacheHintFlag) != 0;
+ return Bitfield::get<L2CacheHintBit>(Hint);
}
-// Cache policy data for a single memory operation.
-// Stored per-MMO to avoid pointer collisions when multiple memops share
-// the same pointer value but have different cache policies.
-struct MMOCachePolicyData {
- uint64_t
- Policy; // The 64-bit cache policy value for L2::cache_hint (0 if not set)
- unsigned CacheHint; // Other cache hints (L1 eviction, L2 eviction, prefetch)
-};
-
-// Per-function cache policy data. Keyed by MachineMemOperand* for direct
-// lookup during instruction selection, ensuring each memop gets its own policy.
-struct FunctionCachePolicyData {
- DenseMap<MachineMemOperand *, MMOCachePolicyData> MMOMap;
-
- void clear() { MMOMap.clear(); }
-};
-
-// Operand indices for LD (load) machine instructions.
-// These match the operand order in NVPTXInstrInfo.td LD class.
-// Use LDOp::* for getOperand() (includes def), or subtract 1 for uses()
-// iterator.
-namespace LDOp {
-enum : unsigned {
- Dst = 0, // Output register (def)
- Ordering = 1, // Memory ordering (sem)
- Scope = 2, // Memory scope
- AddrSpace = 3, // Address space
- Sign = 4, // Signedness
- Width = 5, // Load width in bits
- UsedBytes = 6, // Used bytes mask
- CacheHint = 7, // Cache hint flags
- Base = 8, // Base pointer (from ADDR)
- Offset = 9, // Offset (from ADDR)
- Policy = 10 // Cache policy register
-};
-} // namespace LDOp
-
-// Operand indices for ST (store) machine instructions.
-// These match the operand order in NVPTXInstrInfo.td ST class.
-namespace STOp {
-enum : unsigned {
- Value = 0, // Value to store
- Ordering = 1, // Memory ordering (sem)
- Scope = 2, // Memory scope
- AddrSpace = 3, // Address space
- Width = 4, // Store width in bits
- CacheHint = 5, // Cache hint flags
- Base = 6, // Base pointer (from ADDR)
- Offset = 7, // Offset (from ADDR)
- Policy = 8 // Cache policy register
-};
-} // namespace STOp
-
namespace PTXLdStInstCode {
enum FromType { Unsigned = 0, Signed, Float, Untyped };
} // namespace PTXLdStInstCode
@@ -399,4 +355,9 @@ void initializeNVPTXDAGToDAGISelLegacyPass(PassRegistry &);
#define GET_INSTRINFO_MC_HELPER_DECLS
#include "NVPTXGenInstrInfo.inc"
+// Pull in OpName enum and getNamedOperandIdx() for LD/ST instructions.
+// Generated from UseNamedOperandTable=1 in NVPTXInstrInfo.td.
+#define GET_INSTRINFO_OPERAND_ENUM
+#include "NVPTXGenInstrInfo.inc"
+
#endif
diff --git a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
index 25eb2da33bbd9..62214717a646e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
@@ -96,14 +96,13 @@ static bool eliminateMove(MachineInstr &Mov, const MachineRegisterInfo &MRI,
const MachineOperand *ParamSymbol = Mov.uses().begin();
assert(ParamSymbol->isSymbol());
- // uses() iterator skips defs, so subtract 1 from LDOp indices
- constexpr unsigned LDUsesBasePtrIdx = NVPTX::LDOp::Base - 1;
- constexpr unsigned LDUsesAddrSpaceIdx = NVPTX::LDOp::AddrSpace - 1;
for (auto *LI : LoadInsts) {
- (LI->uses().begin() + LDUsesBasePtrIdx)
- ->ChangeToES(ParamSymbol->getSymbolName());
- (LI->uses().begin() + LDUsesAddrSpaceIdx)
- ->ChangeToImmediate(NVPTX::AddressSpace::Param);
+ int AddrIdx = NVPTX::getNamedOperandIdx(LI->getOpcode(), NVPTX::OpName::addr);
+ int AddspIdx =
+ NVPTX::getNamedOperandIdx(LI->getOpcode(), NVPTX::OpName::addsp);
+ assert(AddrIdx != -1 && AddspIdx != -1 && "Expected LD instruction");
+ LI->getOperand(AddrIdx).ChangeToES(ParamSymbol->getSymbolName());
+ LI->getOperand(AddspIdx).ChangeToImmediate(NVPTX::AddressSpace::Param);
}
return true;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 9b28e2a0967ab..257b5cd8f13c7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -12,6 +12,7 @@
#include "NVPTXISelDAGToDAG.h"
#include "NVPTX.h"
+#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -70,13 +71,7 @@ bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
Scopes = NVPTXScopes(MF.getFunction().getContext());
- bool Result = SelectionDAGISel::runOnMachineFunction(MF);
-
- // Clear per-function cache policy data after instruction selection completes
- // to prevent memory growth over time.
- TM.clearCachePolicyData(&MF.getFunction());
-
- return Result;
+ return SelectionDAGISel::runOnMachineFunction(MF);
}
NVPTX::DivPrecisionLevel
@@ -1119,45 +1114,43 @@ bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
// Helper to extract cache hint from a MemSDNode via MMO lookup.
// The cache hint is stored per-MMO by recordTargetMMOInfo().
-static unsigned getCacheHint(const MemSDNode *N, const Function &F,
- const NVPTXTargetMachine &TM) {
+static unsigned getCacheHint(const MemSDNode *N,
+ const NVPTXMachineFunctionInfo *MFI) {
MachineMemOperand *MMO = N->getMemOperand();
- if (!MMO)
+ if (!MMO || !MFI)
return 0;
- auto &Data = TM.getCachePolicyData(&F);
- auto It = Data.MMOMap.find(MMO);
- if (It == Data.MMOMap.end())
+ const auto *Data = MFI->getCachePolicyData(MMO);
+ if (!Data)
return 0;
- return It->second.CacheHint;
+ return Data->CacheHint;
}
// Helper to get cache policy value if present (for L2::cache_hint mode).
// Returns the 64-bit policy descriptor stored per-MMO.
-static std::optional<uint64_t> getCachePolicy(const MemSDNode *N,
- const Function &F,
- const NVPTXTargetMachine &TM) {
+static std::optional<uint64_t>
+getCachePolicy(const MemSDNode *N, const NVPTXMachineFunctionInfo *MFI) {
MachineMemOperand *MMO = N->getMemOperand();
- if (!MMO)
+ if (!MMO || !MFI)
return std::nullopt;
- auto &Data = TM.getCachePolicyData(&F);
- auto It = Data.MMOMap.find(MMO);
- if (It == Data.MMOMap.end())
+ const auto *Data = MFI->getCachePolicyData(MMO);
+ if (!Data)
return std::nullopt;
// Only return policy if L2CacheHintFlag is set (indicating policy mode)
- if (!(It->second.CacheHint & NVPTX::L2CacheHintFlag))
+ if (!(Data->CacheHint & NVPTX::L2CacheHintFlag))
return std::nullopt;
- return It->second.Policy;
+ return Data->Policy;
}
std::pair<unsigned, SDValue> NVPTXDAGToDAGISel::getCacheHintAndPolicyReg(
const MemSDNode *N, unsigned CodeAddrSpace, const SDLoc &DL) {
// Extract cache hint from MMO flags
- unsigned CacheHint = getCacheHint(N, MF->getFunction(), TM);
+ auto *MFI = MF->getInfo<NVPTXMachineFunctionInfo>();
+ unsigned CacheHint = getCacheHint(N, MFI);
SDValue PolicyReg;
// Apply SM version guards for cache hints (from PTX ISA documentation):
@@ -1206,7 +1199,7 @@ std::pair<unsigned, SDValue> NVPTXDAGToDAGISel::getCacheHintAndPolicyReg(
CacheHint &= ~NVPTX::L2CacheHintFlag;
} else if (Subtarget->hasL2CacheHint()) {
// Check for L2::cache_hint with cache-policy (requires SM 80+ and PTX 7.4+)
- if (auto CachePolicyVal = getCachePolicy(N, MF->getFunction(), TM)) {
+ if (auto CachePolicyVal = getCachePolicy(N, MFI)) {
SDValue PolicyConst =
CurDAG->getTargetConstant(*CachePolicyVal, DL, MVT::i64);
PolicyReg = SDValue(
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b3d45e68fbdcb..0dd720681985b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -15,6 +15,7 @@
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXISelDAGToDAG.h"
+#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXSelectionDAGInfo.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
@@ -7354,7 +7355,8 @@ NVPTXTargetLowering::getTargetMMOFlags(const Instruction &I) const {
return MachineMemOperand::MONone;
}
-void NVPTXTargetLowering::recordTargetMMOInfo(MachineMemOperand *MMO,
+void NVPTXTargetLowering::recordTargetMMOInfo(MachineFunction &MF,
+ MachineMemOperand *MMO,
const Instruction &I,
unsigned OperandNo) const {
// Check for !mem.cache_hint metadata on memory-accessing instructions.
@@ -7379,10 +7381,9 @@ void NVPTXTargetLowering::recordTargetMMOInfo(MachineMemOperand *MMO,
if (CacheHint == 0 && CachePolicy == 0)
return;
- // Store in per-function map keyed by MMO pointer
- const Function *F = I.getFunction();
- auto &Data = nvTM->getCachePolicyData(F);
- Data.MMOMap[MMO] = {CachePolicy, CacheHint};
+ // Store in MachineFunctionInfo keyed by MMO pointer
+ auto *MFI = MF.getInfo<NVPTXMachineFunctionInfo>();
+ MFI->setCachePolicyData(MMO, CachePolicy, CacheHint);
}
NVPTXTargetLowering::AtomicExpansionKind
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 3e27a47f2ffd4..77df032d03e1f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -179,10 +179,11 @@ class NVPTXTargetLowering : public TargetLowering {
/// Record cache policy info for a MachineMemOperand.
/// Called by SelectionDAGBuilder after creating an MMO from an IR
- /// instruction. Stores policy/hints in a per-MMO map for lookup during
+ /// instruction. Stores policy/hints in MachineFunctionInfo for lookup during
/// instruction selection. OperandNo specifies which memory operand (for
/// memcpy: 0=dest, 1=src).
- void recordTargetMMOInfo(MachineMemOperand *MMO, const Instruction &I,
+ void recordTargetMMOInfo(MachineFunction &MF, MachineMemOperand *MMO,
+ const Instruction &I,
unsigned OperandNo) const override;
AtomicExpansionKind
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index db2d96f5ff532..dc35a8338e009 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -22,6 +22,10 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "NVPTXGenInstrInfo.inc"
+// Implementation of getNamedOperandIdx() for LD/ST instructions.
+#define GET_INSTRINFO_NAMED_OPS
+#include "NVPTXGenInstrInfo.inc"
+
// Pin the vtable to this file.
void NVPTXInstrInfo::anchor() {}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f3125a9b9f381..374db8d2973b0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1860,7 +1860,9 @@ class LD<NVPTXRegClass regclass>
"${usedBytes}"
"ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
"cacheHint:l2}${cacheHint:prefetch}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr]${policy};">;
+ "\t$dst, [$addr]${policy};"> {
+ let UseNamedOperandTable = 1;
+}
let mayLoad=1, hasSideEffects=0 in {
def LD_i16 : LD<B16>;
@@ -1875,7 +1877,9 @@ class ST<DAGOperand O>
ADDR:$addr, CachePolicy:$policy),
"st${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
"cacheHint:l2}${cacheHint:prefetch}.b$toWidth"
- " \t[$addr], $src${policy};">;
+ " \t[$addr], $src${policy};"> {
+ let UseNamedOperandTable = 1;
+}
let mayStore=1, hasSideEffects=0 in {
def ST_i16 : ST<RI16>;
@@ -1887,41 +1891,43 @@ let mayStore=1, hasSideEffects=0 in {
// elementization happens at the machine instruction level, so the following
// instructions never appear in the DAG.
multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
- def _v2
- : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2),
- (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
- AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
- CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
- "${usedBytes}"
- "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
- "cacheHint:l2}${cacheHint:prefetch}.v2.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2}}, [$addr]${policy};">;
- def _v4
- : NVPTXInst<
- (outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4),
- (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
- AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
- CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
- "${usedBytes}"
- "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
- "cacheHint:l2}${cacheHint:prefetch}.v4.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr]${policy};">;
- if support_v8 then
- def _v8
+ let UseNamedOperandTable = 1 in {
+ def _v2
+ : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
+ AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
+ CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
+ "${usedBytes}"
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
+ "cacheHint:l2}${cacheHint:prefetch}.v2.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2}}, [$addr]${policy};">;
+ def _v4
: NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4, regclass:$dst5, regclass:$dst6,
- regclass:$dst7, regclass:$dst8),
+ regclass:$dst4),
(ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
"${usedBytes}"
"ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
- "cacheHint:l2}${cacheHint:prefetch}.v8.${Sign:sign}$fromWidth "
- "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, "
- "[$addr]${policy};">;
+ "cacheHint:l2}${cacheHint:prefetch}.v4.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr]${policy};">;
+ if support_v8 then
+ def _v8
+ : NVPTXInst<
+ (outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4, regclass:$dst5, regclass:$dst6,
+ regclass:$dst7, regclass:$dst8),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
+ AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
+ CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
+ "${usedBytes}"
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
+ "cacheHint:l2}${cacheHint:prefetch}.v8.${Sign:sign}$fromWidth "
+ "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, "
+ "[$addr]${policy};">;
+ }
}
let mayLoad=1, hasSideEffects=0 in {
defm LDV_i16 : LD_VEC<B16>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
index 8feae341893aa..cc548ccfa2607 100644
--- a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -14,16 +14,30 @@
#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXMACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_NVPTX_NVPTXMACHINEFUNCTIONINFO_H
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineFunction.h"
namespace llvm {
+
+// Cache policy data for a single memory operation.
+// Stored per-MMO to avoid pointer collisions when multiple memops share
+// the same pointer value but have different cache policies.
+struct NVPTXMMOCachePolicyData {
+ uint64_t Policy; // The 64-bit cache policy value for L2::cache_hint
+ unsigned CacheHint; // Other cache hints (L1 eviction, L2 eviction, prefetch)
+};
+
class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
private:
/// Stores a mapping from index to symbol name for image handles that are
/// replaced with image references
SmallVector<std::string, 8> ImageHandleList;
+ /// Per-MMO cache policy data for !mem.cache_hint metadata.
+ /// Keyed by MachineMemOperand* for direct lookup during instruction selection.
+ DenseMap<MachineMemOperand *, NVPTXMMOCachePolicyData> CachePolicyMap;
+
public:
NVPTXMachineFunctionInfo(const Function &F, const TargetSubtargetInfo *STI) {}
@@ -52,6 +66,21 @@ class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
bool checkImageHandleSymbol(StringRef Symbol) const {
return llvm::is_contained(ImageHandleList, Symbol);
}
+
+ /// Store cache policy data for a MachineMemOperand.
+ void setCachePolicyData(MachineMemOperand *MMO, uint64_t Policy,
+ unsigned CacheHint) {
+ CachePolicyMap[MMO] = {Policy, CacheHint};
+ }
+
+ /// Get cache policy data for a MachineMemOperand.
+ /// Returns nullptr if no data exists for this MMO.
+ const NVPTXMMOCachePolicyData *getCachePolicyData(MachineMemOperand *MMO) const {
+ auto It = CachePolicyMap.find(MMO);
+ if (It == CachePolicyMap.end())
+ return nullptr;
+ return &It->second;
+ }
};
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index fadceefe0a00c..35085aa43ffe0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -1808,9 +1808,11 @@ bool NVPTXReplaceImageHandles::replaceImageHandle(MachineOperand &Op,
// For CUDA, we preserve the param loads coming from function arguments
return false;
- assert(TexHandleDef.getOperand(NVPTX::LDOp::Base).isSymbol() &&
+ int AddrIdx = NVPTX::getNamedOperandIdx(TexHandleDef.getOpcode(),
+ NVPTX::OpName::addr);
+ assert(AddrIdx != -1 && TexHandleDef.getOperand(AddrIdx).isSymbol() &&
"Load is not a symbol!");
- StringRef Sym = TexHandleDef.getOperand(NVPTX::LDOp::Base).getSymbolName();
+ StringRef Sym = TexHandleDef.getOperand(AddrIdx).getSymbolName();
InstrsToRemove.insert(&TexHandleDef);
Op.ChangeToES(Sym.data());
MFI->getImageHandleSymbolIndex(Sym);
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
index 3fc009af71a37..40870cd154c05 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -33,18 +33,6 @@ class NVPTXTargetMachine : public CodeGenTargetMachineImpl {
BumpPtrAllocator StrAlloc;
UniqueStringSaver StrPool;
- // Per-function cache policy storage for !mem.cache_hint metadata.
- // Mutable because it's modified during const lowering operations.
- // Data is keyed by Function* and each function is processed sequentially
- // through the pipeline, so no synchronization is needed.
- // IMPORTANT: Data must be cleared after instruction selection completes
- // via clearCachePolicyData() in NVPTXDAGToDAGISel::runOnMachineFunction().
- // The unique_ptr ensures cleanup even if clearCachePolicyData is not called,
- // but explicit clearing prevents unbounded memory growth.
- mutable DenseMap<const Function *,
- std::unique_ptr<NVPTX::FunctionCachePolicyData>>
- CachePolicyData;
-
public:
NVPTXTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
@@ -89,19 +77,6 @@ class NVPTXTargetMachine : public CodeGenTargetMachineImpl {
std::pair<const Value *, unsigned>
getPredicatedAddrSpace(const Value *V) const override;
-
- // Cache policy data management for !mem.cache_hint metadata.
- // These methods are const but modify mutable state.
- NVPTX::FunctionCachePolicyData &getCachePolicyData(const Function *F) const {
- auto &Data = CachePolicyData[F];
- if (!Data)
- Data = std::make_unique<NVPTX::FunctionCachePolicyData>();
- return *Data;
- }
-
- void clearCachePolicyData(const Function *F) const {
- CachePolicyData.erase(F);
- }
}; // NVPTXTargetMachine.
class NVPTXTargetMachine32 : public NVPTXTargetMachine {
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 7dd3fc7f4ffa8..3ff0e7ecf8411 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Debug.h"
@@ -422,8 +423,8 @@ static const MDNode *findCacheHintNode(const MDNode *MD, unsigned OperandNo) {
if (!MD)
return nullptr;
- for (unsigned i = 0, e = MD->getNumOperands(); i < e; ++i) {
- const MDNode *Node = dyn_cast<MDNode>(MD->getOperand(i));
+ for (const MDOperand &Op : MD->operands()) {
+ const auto *Node = dyn_cast<MDNode>(Op);
if (!Node || Node->getNumOperands() < 2) {
LLVM_DEBUG(if (Node) dbgs()
<< "NVPTX: Skipping malformed cache hint node with "
@@ -432,21 +433,16 @@ static const MDNode *findCacheHintNode(const MDNode *MD, unsigned OperandNo) {
}
// Search for operand_no in the node (can be at any position)
+ // Key-value pairs require index iteration with stride 2
std::optional<unsigned> NodeOperandNo;
for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
- const MDString *Key = dyn_cast<MDString>(Node->getOperand(j));
+ const auto *Key = dyn_cast<MDString>(Node->getOperand(j));
if (Key && Key->getString() == "operand_no") {
- if (auto *OpNoMD =
- dyn_cast<ConstantAsMetadata>(Node->getOperand(j + 1))) {
- if (auto *OpNoCI = dyn_cast<ConstantInt>(OpNoMD->getValue()))
- NodeOperandNo = OpNoCI->getZExtValue();
- else
- LLVM_DEBUG(dbgs()
- << "NVPTX: operand_no value is not ConstantInt\n");
- } else {
- LLVM_DEBUG(dbgs()
- << "NVPTX: operand_no value is not ConstantAsMetadata\n");
- }
+ if (auto *OpNoCI =
+ mdconst::dyn_extract<ConstantInt>(Node->getOperand(j + 1)))
+ NodeOperandNo = OpNoCI->getZExtValue();
+ else
+ LLVM_DEBUG(dbgs() << "NVPTX: operand_no value is not ConstantInt\n");
break;
}
}
@@ -477,8 +473,9 @@ unsigned getCacheHintFromMetadata(const Instruction *I, unsigned OperandNo) {
L2Prefetch Prefetch = L2Prefetch::None;
// Parse all key-value pairs from the matching node
+ // Key-value pairs require index iteration with stride 2
for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
- const MDString *Key = dyn_cast<MDString>(Node->getOperand(j));
+ const auto *Key = dyn_cast<MDString>(Node->getOperand(j));
if (!Key) {
LLVM_DEBUG(dbgs() << "NVPTX: Cache hint key at index " << j
<< " is not a string\n");
@@ -490,7 +487,7 @@ unsigned getCacheHintFromMetadata(const Instruction *I, unsigned OperandNo) {
continue; // Already processed by findCacheHintNode
// For eviction and prefetch hints, value should be a string
- const MDString *Val = dyn_cast<MDString>(Node->getOperand(j + 1));
+ const auto *Val = dyn_cast<MDString>(Node->getOperand(j + 1));
if (!Val) {
// nvvm.l2_cache_hint uses i64, not string - skip here
if (KeyStr != "nvvm.l2_cache_hint") {
@@ -537,26 +534,19 @@ std::optional<uint64_t> getCachePolicyFromMetadata(const Instruction *I,
return std::nullopt;
// Look for nvvm.l2_cache_hint in the matching node
+ // Key-value pairs require index iteration with stride 2
for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
- const MDString *Key = dyn_cast<MDString>(Node->getOperand(j));
+ const auto *Key = dyn_cast<MDString>(Node->getOperand(j));
if (!Key || Key->getString() != "nvvm.l2_cache_hint")
continue;
// The value should be an i64 constant
- auto *ValMD = dyn_cast<ConstantAsMetadata>(Node->getOperand(j + 1));
- if (!ValMD) {
- LLVM_DEBUG(dbgs() << "NVPTX: nvvm.l2_cache_hint value is not "
- "ConstantAsMetadata\n");
- continue;
- }
- auto *ValCI = dyn_cast<ConstantInt>(ValMD->getValue());
- if (!ValCI) {
- LLVM_DEBUG(
- dbgs() << "NVPTX: nvvm.l2_cache_hint value is not ConstantInt\n");
- continue;
- }
+ if (auto *ValCI =
+ mdconst::dyn_extract<ConstantInt>(Node->getOperand(j + 1)))
+ return ValCI->getZExtValue();
- return ValCI->getZExtValue();
+ LLVM_DEBUG(
+ dbgs() << "NVPTX: nvvm.l2_cache_hint value is not ConstantInt\n");
}
return std::nullopt;
>From 36d51368ea84c609623741797fb4306a60645959 Mon Sep 17 00:00:00 2001
From: Fei Peng <feip at nvidia.com>
Date: Thu, 15 Jan 2026 11:37:11 -0800
Subject: [PATCH 04/10] [NVPTX] Add tests for invalid nvvm.l2_cache_hint values
Add tests to verify that when nvvm.l2_cache_hint key exists but the
value is not a valid constant (string or null metadata), we do NOT
emit L2::cache_hint mode. Other hints (L1/L2 eviction, prefetch)
should still work correctly in these cases.
Test cases:
- String value instead of i64 constant
- Null/metadata reference instead of constant
- i32 instead of i64 (still works - ConstantInt accepts any int type)
---
.../CodeGen/NVPTX/load-store-cache-hint.ll | 60 +++++++++++++++++++
1 file changed, 60 insertions(+)
diff --git a/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll b/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
index 13a1b5a2ffd50..abb5f902fb39f 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
@@ -603,6 +603,49 @@ define i32 @test_load_reordered_metadata(ptr addrspace(1) %p) {
ret i32 %v
}
+;-----------------------------------------------------------------------------
+; nvvm.l2_cache_hint with invalid value - should NOT emit L2::cache_hint
+; These tests verify that when nvvm.l2_cache_hint key exists but the value
+; is not a valid i64 constant, we do NOT emit L2::cache_hint mode.
+;-----------------------------------------------------------------------------
+
+; nvvm.l2_cache_hint with string value instead of i64 - should be ignored
+; CHECK-LABEL: test_load_cache_hint_string_value
+; CHECK: ld.global.L1::evict_first.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}];
+; CHECK-NOT: L2::cache_hint
+define i32 @test_load_cache_hint_string_value(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !80
+ ret i32 %v
+}
+
+; nvvm.l2_cache_hint with null/missing value - should be ignored
+; CHECK-LABEL: test_load_cache_hint_null_value
+; CHECK: ld.global.L1::evict_last.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}];
+; CHECK-NOT: L2::cache_hint
+define i32 @test_load_cache_hint_null_value(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !81
+ ret i32 %v
+}
+
+; nvvm.l2_cache_hint with wrong type (i32 instead of i64) - should still work
+; as mdconst::dyn_extract<ConstantInt> accepts any integer type
+; CHECK-LABEL: test_load_cache_hint_i32_value
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 99999
+; CHECK: ld.global.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+define i32 @test_load_cache_hint_i32_value(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !mem.cache_hint !82
+ ret i32 %v
+}
+
+; Store: nvvm.l2_cache_hint with string value - should be ignored
+; CHECK-LABEL: test_store_cache_hint_string_value
+; CHECK: st.global.L1::evict_unchanged.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}};
+; CHECK-NOT: L2::cache_hint
+define void @test_store_cache_hint_string_value(ptr addrspace(1) %p, i32 %v) {
+ store i32 %v, ptr addrspace(1) %p, !mem.cache_hint !83
+ ret void
+}
+
; Test "normal" eviction - should not emit any qualifier (default behavior)
; CHECK-LABEL: test_load_l1_normal
; CHECK: ld.global.b32
@@ -771,3 +814,20 @@ define i32 @test_load_l2_normal(ptr addrspace(1) %p) {
; Reordered metadata test (operand_no not first)
!70 = !{!170}
!170 = !{!"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first", !"operand_no", i32 0}
+
+; Invalid nvvm.l2_cache_hint values - should be ignored, no L2::cache_hint emitted
+; String value instead of i64 - invalid, L1 hint should still work
+!80 = !{!180}
+!180 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first", !"nvvm.l2_cache_hint", !"not_a_number"}
+
+; Null/metadata reference instead of constant - invalid
+!81 = !{!181}
+!181 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last", !"nvvm.l2_cache_hint", !{}}
+
+; i32 instead of i64 - still valid, ConstantInt accepts any integer type
+!82 = !{!182}
+!182 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i32 99999}
+
+; Store: string value for nvvm.l2_cache_hint - invalid
+!83 = !{!183}
+!183 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"unchanged", !"nvvm.l2_cache_hint", !"invalid"}
>From 9150bbea353670ca7c79125cd07572b04e2f16c8 Mon Sep 17 00:00:00 2001
From: Fei Peng <feip at nvidia.com>
Date: Thu, 15 Jan 2026 11:39:38 -0800
Subject: [PATCH 05/10] [NVPTX] Fix clang-format issues in cache hint support
---
llvm/lib/Target/NVPTX/NVPTX.h | 3 ++-
llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp | 3 ++-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 5 +++--
llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h | 6 ++++--
4 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 18d3d9328de67..c23321e6634be 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -239,7 +239,8 @@ enum class L2Prefetch : uint8_t {
using L1EvictionBits =
Bitfield::Element<L1Eviction, 0, 3, L1Eviction::NoAllocate>;
using L2EvictionBits = Bitfield::Element<L2Eviction, 3, 2, L2Eviction::Last>;
-using L2PrefetchBits = Bitfield::Element<L2Prefetch, 5, 2, L2Prefetch::Bytes256>;
+using L2PrefetchBits =
+ Bitfield::Element<L2Prefetch, 5, 2, L2Prefetch::Bytes256>;
using L2CacheHintBit = Bitfield::Element<bool, 7, 1>;
// Masks for clearing/testing fields (for legacy code and instruction emission)
diff --git a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
index 62214717a646e..25103ae54c865 100644
--- a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp
@@ -97,7 +97,8 @@ static bool eliminateMove(MachineInstr &Mov, const MachineRegisterInfo &MRI,
assert(ParamSymbol->isSymbol());
for (auto *LI : LoadInsts) {
- int AddrIdx = NVPTX::getNamedOperandIdx(LI->getOpcode(), NVPTX::OpName::addr);
+ int AddrIdx =
+ NVPTX::getNamedOperandIdx(LI->getOpcode(), NVPTX::OpName::addr);
int AddspIdx =
NVPTX::getNamedOperandIdx(LI->getOpcode(), NVPTX::OpName::addsp);
assert(AddrIdx != -1 && AddspIdx != -1 && "Expected LD instruction");
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 374db8d2973b0..f379a35fd8e6f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1920,8 +1920,9 @@ multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
regclass:$dst4, regclass:$dst5, regclass:$dst6,
regclass:$dst7, regclass:$dst8),
(ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
- AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
- CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
+ AtomicCode:$Sign, i32imm:$fromWidth,
+ UsedBytesMask:$usedBytes, CacheHint:$cacheHint, ADDR:$addr,
+ CachePolicy:$policy),
"${usedBytes}"
"ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
"cacheHint:l2}${cacheHint:prefetch}.v8.${Sign:sign}$fromWidth "
diff --git a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
index cc548ccfa2607..2bd83e83c46de 100644
--- a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -35,7 +35,8 @@ class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
SmallVector<std::string, 8> ImageHandleList;
/// Per-MMO cache policy data for !mem.cache_hint metadata.
- /// Keyed by MachineMemOperand* for direct lookup during instruction selection.
+ /// Keyed by MachineMemOperand* for direct lookup during instruction
+ /// selection.
DenseMap<MachineMemOperand *, NVPTXMMOCachePolicyData> CachePolicyMap;
public:
@@ -75,7 +76,8 @@ class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
/// Get cache policy data for a MachineMemOperand.
/// Returns nullptr if no data exists for this MMO.
- const NVPTXMMOCachePolicyData *getCachePolicyData(MachineMemOperand *MMO) const {
+ const NVPTXMMOCachePolicyData *
+ getCachePolicyData(MachineMemOperand *MMO) const {
auto It = CachePolicyMap.find(MMO);
if (It == CachePolicyMap.end())
return nullptr;
>From 3b185654ceb5ea5c76d37aa35066a662239a3af5 Mon Sep 17 00:00:00 2001
From: Fei Peng <feip at nvidia.com>
Date: Thu, 15 Jan 2026 12:41:03 -0800
Subject: [PATCH 06/10] [NVPTX] Rename CacheHint to CacheControlHint for
clarity
Rename variables and functions to clarify the distinction between:
- CacheControlHint: the encoded value from !mem.cache_hint metadata
(L1/L2 eviction policies, L2 prefetch size)
- CachePolicy: the i64 value for L2::cache_hint PTX qualifier
Also adds a comment explaining that cache control hints are preferred
over .nc (LDG) since they can express what .nc does and more.
Renames include:
- getCacheHintFromMetadata -> getCacheControlHintFromMetadata
- getCacheHint -> getCacheControlHint
- getCacheHintAndPolicyReg -> getCacheControlHintAndPolicyReg
- encodeCacheHint -> encodeCacheControlHint
- CacheHint operand -> CacheControlHint in TableGen
- printCacheHint -> printCacheControlHint
---
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 5 +-
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 4 +-
llvm/lib/Target/NVPTX/NVPTX.h | 5 +-
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 70 +++++-----
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 14 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 13 +-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 122 ++++++++++--------
.../Target/NVPTX/NVPTXMachineFunctionInfo.h | 8 +-
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 12 +-
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 7 +-
.../CodeGen/NVPTX/load-store-cache-hint.ll | 29 +++++
11 files changed, 175 insertions(+), 114 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index df739978ac3ae..025a1ddaa6e0f 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -363,8 +363,9 @@ void NVPTXInstPrinter::printAtomicCode(const MCInst *MI, int OpNum,
llvm_unreachable(formatv("Unknown Modifier: {}", Modifier).str().c_str());
}
-void NVPTXInstPrinter::printCacheHint(const MCInst *MI, int OpNum,
- raw_ostream &O, StringRef Modifier) {
+void NVPTXInstPrinter::printCacheControlHint(const MCInst *MI, int OpNum,
+ raw_ostream &O,
+ StringRef Modifier) {
const MCOperand &MO = MI->getOperand(OpNum);
unsigned Hint = MO.getImm();
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
index f1cc1e5ec979d..091d22f2881dd 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h
@@ -42,8 +42,8 @@ class NVPTXInstPrinter : public MCInstPrinter {
StringRef Modifier = {});
void printAtomicCode(const MCInst *MI, int OpNum, raw_ostream &O,
StringRef Modifier = {});
- void printCacheHint(const MCInst *MI, int OpNum, raw_ostream &O,
- StringRef Modifier = {});
+ void printCacheControlHint(const MCInst *MI, int OpNum, raw_ostream &O,
+ StringRef Modifier = {});
void printCachePolicy(const MCInst *MI, int OpNum, raw_ostream &O);
void printMmaCode(const MCInst *MI, int OpNum, raw_ostream &O,
StringRef Modifier = {});
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index c23321e6634be..5049e5ad71a1e 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -228,7 +228,7 @@ enum class L2Prefetch : uint8_t {
Bytes256 = 3, // L2::256B
};
-// Bitfield layout for encoded cache hints (stored in unsigned):
+// Bitfield layout for encoded cache control hints (stored in unsigned):
// Bits 0-2: L1 Eviction (3 bits, 5 values)
// Bits 3-4: L2 Eviction (2 bits, 3 values)
// Bits 5-6: L2 Prefetch (2 bits, 4 values)
@@ -252,7 +252,8 @@ constexpr unsigned L2PrefetchShift = 5;
constexpr unsigned L2PrefetchMask = 0x3;
constexpr unsigned L2CacheHintFlag = 0x80; // Bit 7: L2::cache_hint mode
-inline unsigned encodeCacheHint(L1Eviction L1, L2Eviction L2, L2Prefetch P) {
+inline unsigned encodeCacheControlHint(L1Eviction L1, L2Eviction L2,
+ L2Prefetch P) {
unsigned Hint = 0;
Bitfield::set<L1EvictionBits>(Hint, L1);
Bitfield::set<L2EvictionBits>(Hint, L2);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 257b5cd8f13c7..52264eaa1cfce 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1112,10 +1112,10 @@ bool NVPTXDAGToDAGISel::SelectADDR(SDValue Addr, SDValue &Base,
return true;
}
-// Helper to extract cache hint from a MemSDNode via MMO lookup.
-// The cache hint is stored per-MMO by recordTargetMMOInfo().
-static unsigned getCacheHint(const MemSDNode *N,
- const NVPTXMachineFunctionInfo *MFI) {
+// Helper to extract cache control hint from a MemSDNode via MMO lookup.
+// The cache control hint is stored per-MMO by recordTargetMMOInfo().
+static unsigned getCacheControlHint(const MemSDNode *N,
+ const NVPTXMachineFunctionInfo *MFI) {
MachineMemOperand *MMO = N->getMemOperand();
if (!MMO || !MFI)
return 0;
@@ -1124,7 +1124,7 @@ static unsigned getCacheHint(const MemSDNode *N,
if (!Data)
return 0;
- return Data->CacheHint;
+ return Data->CacheControlHint;
}
// Helper to get cache policy value if present (for L2::cache_hint mode).
@@ -1140,17 +1140,17 @@ getCachePolicy(const MemSDNode *N, const NVPTXMachineFunctionInfo *MFI) {
return std::nullopt;
// Only return policy if L2CacheHintFlag is set (indicating policy mode)
- if (!(Data->CacheHint & NVPTX::L2CacheHintFlag))
+ if (!(Data->CacheControlHint & NVPTX::L2CacheHintFlag))
return std::nullopt;
return Data->Policy;
}
-std::pair<unsigned, SDValue> NVPTXDAGToDAGISel::getCacheHintAndPolicyReg(
+std::pair<unsigned, SDValue> NVPTXDAGToDAGISel::getCacheControlHintAndPolicyReg(
const MemSDNode *N, unsigned CodeAddrSpace, const SDLoc &DL) {
- // Extract cache hint from MMO flags
+ // Extract cache control hint from MMO flags
auto *MFI = MF->getInfo<NVPTXMachineFunctionInfo>();
- unsigned CacheHint = getCacheHint(N, MFI);
+ unsigned CacheControlHint = getCacheControlHint(N, MFI);
SDValue PolicyReg;
// Apply SM version guards for cache hints (from PTX ISA documentation):
@@ -1162,16 +1162,16 @@ std::pair<unsigned, SDValue> NVPTXDAGToDAGISel::getCacheHintAndPolicyReg(
// Check L1 eviction hint (SM 70+)
if (!Subtarget->hasL1EvictionHint()) {
- CacheHint &= ~(NVPTX::L1EvictionMask << NVPTX::L1EvictionShift);
+ CacheControlHint &= ~(NVPTX::L1EvictionMask << NVPTX::L1EvictionShift);
}
// Check L2 eviction hint (SM 70+)
if (!Subtarget->hasL2EvictionHint()) {
- CacheHint &= ~(NVPTX::L2EvictionMask << NVPTX::L2EvictionShift);
+ CacheControlHint &= ~(NVPTX::L2EvictionMask << NVPTX::L2EvictionShift);
}
// Check L2 prefetch hints (SM 75+ for 64B/128B, SM 80+ for 256B)
- auto L2Prefetch = NVPTX::decodeL2Prefetch(CacheHint);
+ auto L2Prefetch = NVPTX::decodeL2Prefetch(CacheControlHint);
if (L2Prefetch != NVPTX::L2Prefetch::None) {
bool PrefetchSupported = false;
switch (L2Prefetch) {
@@ -1189,16 +1189,16 @@ std::pair<unsigned, SDValue> NVPTXDAGToDAGISel::getCacheHintAndPolicyReg(
}
if (!PrefetchSupported) {
// Clear the prefetch bits if not supported
- CacheHint &= ~(NVPTX::L2PrefetchMask << NVPTX::L2PrefetchShift);
+ CacheControlHint &= ~(NVPTX::L2PrefetchMask << NVPTX::L2PrefetchShift);
}
}
// L2::cache_hint is only supported for global address space.
// Clear the flag for non-global address spaces.
if (CodeAddrSpace != NVPTX::AddressSpace::Global) {
- CacheHint &= ~NVPTX::L2CacheHintFlag;
+ CacheControlHint &= ~NVPTX::L2CacheHintFlag;
} else if (Subtarget->hasL2CacheHint()) {
- // Check for L2::cache_hint with cache-policy (requires SM 80+ and PTX 7.4+)
+ // Check for L2::cache_hint with cache policy (requires SM 80+ and PTX 7.4+)
if (auto CachePolicyVal = getCachePolicy(N, MFI)) {
SDValue PolicyConst =
CurDAG->getTargetConstant(*CachePolicyVal, DL, MVT::i64);
@@ -1211,10 +1211,10 @@ std::pair<unsigned, SDValue> NVPTXDAGToDAGISel::getCacheHintAndPolicyReg(
// If no policy or L2::cache_hint not supported, use NOREG and clear flag
if (!PolicyReg) {
PolicyReg = CurDAG->getRegister(NVPTX::NoRegister, MVT::i64);
- CacheHint &= ~NVPTX::L2CacheHintFlag;
+ CacheControlHint &= ~NVPTX::L2CacheHintFlag;
}
- return {CacheHint, PolicyReg};
+ return {CacheControlHint, PolicyReg};
}
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
@@ -1228,7 +1228,11 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
// Address Space Setting
const auto CodeAddrSpace = getAddrSpace(LD);
- if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
+ auto *MFI = MF->getInfo<NVPTXMachineFunctionInfo>();
+ // Prefer cache control hints over .nc (LDG) since cache control hints can
+ // express what .nc does and more.
+ if (getCacheControlHint(LD, MFI) == 0 &&
+ canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
return tryLDG(LD);
SDLoc DL(LD);
@@ -1261,8 +1265,8 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
const MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
- const auto [CacheHint, PolicyReg] =
- getCacheHintAndPolicyReg(LD, CodeAddrSpace, DL);
+ const auto [CacheControlHint, PolicyReg] =
+ getCacheControlHintAndPolicyReg(LD, CodeAddrSpace, DL);
// Create the machine instruction DAG
SDValue Ops[] = {getI32Imm(Ordering, DL),
@@ -1271,7 +1275,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL),
getI32Imm(UsedBytesMask, DL),
- getI32Imm(CacheHint, DL),
+ getI32Imm(CacheControlHint, DL),
Base,
Offset,
PolicyReg,
@@ -1311,7 +1315,11 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
// Address Space Setting
const auto CodeAddrSpace = getAddrSpace(LD);
- if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
+ auto *MFI = MF->getInfo<NVPTXMachineFunctionInfo>();
+ // Prefer cache control hints over .nc (LDG) since cache control hints can
+ // express what .nc does and more.
+ if (getCacheControlHint(LD, MFI) == 0 &&
+ canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
return tryLDG(LD);
const MVT EltVT = LD->getSimpleValueType(0);
@@ -1337,8 +1345,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
assert(!(EltVT.isVector() && ExtensionType != ISD::NON_EXTLOAD));
- const auto [CacheHint, PolicyReg] =
- getCacheHintAndPolicyReg(LD, CodeAddrSpace, DL);
+ const auto [CacheControlHint, PolicyReg] =
+ getCacheControlHintAndPolicyReg(LD, CodeAddrSpace, DL);
const auto [Base, Offset] = selectADDR(N->getOperand(1), CurDAG);
SDValue Ops[] = {getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
@@ -1346,7 +1354,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL),
getI32Imm(UsedBytesMask, DL),
- getI32Imm(CacheHint, DL),
+ getI32Imm(CacheControlHint, DL),
Base,
Offset,
PolicyReg,
@@ -1528,15 +1536,15 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
const auto [Base, Offset] = selectADDR(ST->getBasePtr(), CurDAG);
// Extract cache hint and policy register
- const auto [CacheHint, PolicyReg] =
- getCacheHintAndPolicyReg(ST, CodeAddrSpace, DL);
+ const auto [CacheControlHint, PolicyReg] =
+ getCacheControlHintAndPolicyReg(ST, CodeAddrSpace, DL);
SDValue Ops[] = {selectPossiblyImm(Value),
getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(ToTypeWidth, DL),
- getI32Imm(CacheHint, DL),
+ getI32Imm(CacheControlHint, DL),
Base,
Offset,
PolicyReg,
@@ -1586,13 +1594,13 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
TotalWidth <= 256 && "Invalid width for store");
// Extract cache hint and policy register
- const auto [CacheHint, PolicyReg] =
- getCacheHintAndPolicyReg(ST, CodeAddrSpace, DL);
+ const auto [CacheControlHint, PolicyReg] =
+ getCacheControlHintAndPolicyReg(ST, CodeAddrSpace, DL);
const auto [Base, Offset] = selectADDR(Addr, CurDAG);
Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL), getI32Imm(ToTypeWidth, DL),
- getI32Imm(CacheHint, DL), Base, Offset, PolicyReg, Chain});
+ getI32Imm(CacheControlHint, DL), Base, Offset, PolicyReg, Chain});
const MVT::SimpleValueType EltVT =
ST->getOperand(1).getSimpleValueType().SimpleTy;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index ab8fafa2d9764..b2e7563ca3042 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -105,14 +105,14 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
SDValue getPTXCmpMode(const CondCodeSDNode &CondCode);
SDValue selectPossiblyImm(SDValue V);
- // Returns the cache hint and policy register for a memory operation.
+ // Returns the cache control hint and policy register for a memory operation.
// If L2::cache_hint mode is active (SM 80+, PTX 7.4+, global address space),
- // returns the updated cache hint with L2CacheHintFlag set and a register
- // containing the 64-bit policy value. Otherwise returns the original hint
- // and NOREG.
- std::pair<unsigned, SDValue> getCacheHintAndPolicyReg(const MemSDNode *N,
- unsigned CodeAddrSpace,
- const SDLoc &DL);
+ // returns the updated cache control hint with L2CacheHintFlag set and a
+ // register containing the 64-bit policy value. Otherwise returns the original
+ // hint and NOREG.
+ std::pair<unsigned, SDValue>
+ getCacheControlHintAndPolicyReg(const MemSDNode *N, unsigned CodeAddrSpace,
+ const SDLoc &DL);
// Returns the Memory Order and Scope that the PTX memory instruction should
// use, and inserts appropriate fence instruction before the memory
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 0dd720681985b..16a46f8432e88 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -7364,26 +7364,27 @@ void NVPTXTargetLowering::recordTargetMMOInfo(MachineFunction &MF,
if (!I.mayReadOrWriteMemory())
return;
- // Get cache hint from metadata using the specified operand number.
+ // Get cache control hint from metadata using the specified operand number.
// For load/store: operand_no = 0
// For memcpy: operand_no = 0 (dest/store), operand_no = 1 (src/load)
- unsigned CacheHint = NVPTX::getCacheHintFromMetadata(&I, OperandNo);
+ unsigned CacheControlHint =
+ NVPTX::getCacheControlHintFromMetadata(&I, OperandNo);
- // Check for cache_policy (L2::cache_hint mode)
+ // Check for cache policy (L2::cache_hint mode)
uint64_t CachePolicy = 0;
if (auto Policy = NVPTX::getCachePolicyFromMetadata(&I, OperandNo)) {
CachePolicy = *Policy;
// Set the L2CacheHintFlag to indicate policy mode
- CacheHint |= NVPTX::L2CacheHintFlag;
+ CacheControlHint |= NVPTX::L2CacheHintFlag;
}
// If no cache hints, nothing to store
- if (CacheHint == 0 && CachePolicy == 0)
+ if (CacheControlHint == 0 && CachePolicy == 0)
return;
// Store in MachineFunctionInfo keyed by MMO pointer
auto *MFI = MF.getInfo<NVPTXMachineFunctionInfo>();
- MFI->setCachePolicyData(MMO, CachePolicy, CacheHint);
+ MFI->setCachePolicyData(MMO, CachePolicy, CacheControlHint);
}
NVPTXTargetLowering::AtomicExpansionKind
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f379a35fd8e6f..53099a0dc12d2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1608,7 +1608,9 @@ def AtomicCode : Operand<i32> {
let PrintMethod = "printAtomicCode";
}
-def CacheHint : Operand<i32> { let PrintMethod = "printCacheHint"; }
+def CacheControlHint : Operand<i32> {
+ let PrintMethod = "printCacheControlHint";
+}
def MmaCode : Operand<i32> {
let PrintMethod = "printMmaCode";
@@ -1852,15 +1854,17 @@ def Callseq_End :
def CachePolicy : Operand<i64> { let PrintMethod = "printCachePolicy"; }
class LD<NVPTXRegClass regclass>
- : NVPTXInst<(outs regclass:$dst),
- (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
- AtomicCode:$Sign, i32imm:$fromWidth,
- UsedBytesMask:$usedBytes, CacheHint:$cacheHint, ADDR:$addr,
- CachePolicy:$policy),
- "${usedBytes}"
- "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
- "cacheHint:l2}${cacheHint:prefetch}.${Sign:sign}$fromWidth "
- "\t$dst, [$addr]${policy};"> {
+ : NVPTXInst<
+ (outs regclass:$dst),
+ (ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
+ AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
+ CacheControlHint:$cacheControlHint, ADDR:$addr,
+ CachePolicy:$policy),
+ "${usedBytes}"
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheControlHint:l1}${"
+ "cacheControlHint:l2}${cacheControlHint:prefetch}.${Sign:sign}$"
+ "fromWidth "
+ "\t$dst, [$addr]${policy};"> {
let UseNamedOperandTable = 1;
}
@@ -1871,13 +1875,14 @@ let mayLoad=1, hasSideEffects=0 in {
}
class ST<DAGOperand O>
- : NVPTXInst<(outs),
- (ins O:$src, AtomicCode:$sem, AtomicCode:$scope,
- AtomicCode:$addsp, i32imm:$toWidth, CacheHint:$cacheHint,
- ADDR:$addr, CachePolicy:$policy),
- "st${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
- "cacheHint:l2}${cacheHint:prefetch}.b$toWidth"
- " \t[$addr], $src${policy};"> {
+ : NVPTXInst<
+ (outs),
+ (ins O:$src, AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
+ i32imm:$toWidth, CacheControlHint:$cacheControlHint, ADDR:$addr,
+ CachePolicy:$policy),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${cacheControlHint:l1}${"
+ "cacheControlHint:l2}${cacheControlHint:prefetch}.b$toWidth"
+ " \t[$addr], $src${policy};"> {
let UseNamedOperandTable = 1;
}
@@ -1897,10 +1902,12 @@ multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
(outs regclass:$dst1, regclass:$dst2),
(ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
- CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
+ CacheControlHint:$cacheControlHint, ADDR:$addr,
+ CachePolicy:$policy),
"${usedBytes}"
- "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
- "cacheHint:l2}${cacheHint:prefetch}.v2.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheControlHint:l1}${"
+ "cacheControlHint:l2}${cacheControlHint:prefetch}.v2.${Sign:sign}"
+ "$fromWidth "
"\t{{$dst1, $dst2}}, [$addr]${policy};">;
def _v4
: NVPTXInst<
@@ -1908,10 +1915,12 @@ multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
regclass:$dst4),
(ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
AtomicCode:$Sign, i32imm:$fromWidth, UsedBytesMask:$usedBytes,
- CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
+ CacheControlHint:$cacheControlHint, ADDR:$addr,
+ CachePolicy:$policy),
"${usedBytes}"
- "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
- "cacheHint:l2}${cacheHint:prefetch}.v4.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheControlHint:l1}${"
+ "cacheControlHint:l2}${cacheControlHint:prefetch}.v4.${Sign:sign}"
+ "$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr]${policy};">;
if support_v8 then
def _v8
@@ -1921,11 +1930,14 @@ multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
regclass:$dst7, regclass:$dst8),
(ins AtomicCode:$sem, AtomicCode:$scope, AtomicCode:$addsp,
AtomicCode:$Sign, i32imm:$fromWidth,
- UsedBytesMask:$usedBytes, CacheHint:$cacheHint, ADDR:$addr,
+ UsedBytesMask:$usedBytes,
+ CacheControlHint:$cacheControlHint, ADDR:$addr,
CachePolicy:$policy),
"${usedBytes}"
- "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
- "cacheHint:l2}${cacheHint:prefetch}.v8.${Sign:sign}$fromWidth "
+ "ld${sem:sem}${scope:scope}${addsp:addsp}${cacheControlHint:l1}"
+ "${"
+ "cacheControlHint:l2}${cacheControlHint:prefetch}.v8.${Sign:"
+ "sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, "
"[$addr]${policy};">;
}
@@ -1938,35 +1950,41 @@ let mayLoad=1, hasSideEffects=0 in {
multiclass ST_VEC<DAGOperand O, bit support_v8 = false> {
def _v2
- : NVPTXInst<(outs),
- (ins O:$src1, O:$src2, AtomicCode:$sem, AtomicCode:$scope,
- AtomicCode:$addsp, i32imm:$fromWidth,
- CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
- "st${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
- "cacheHint:l2}${cacheHint:prefetch}.v2.b$fromWidth "
- "\t[$addr], {{$src1, $src2}}${policy};">;
+ : NVPTXInst<
+ (outs),
+ (ins O:$src1, O:$src2, AtomicCode:$sem, AtomicCode:$scope,
+ AtomicCode:$addsp, i32imm:$fromWidth,
+ CacheControlHint:$cacheControlHint, ADDR:$addr,
+ CachePolicy:$policy),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${cacheControlHint:l1}${"
+ "cacheControlHint:l2}${cacheControlHint:prefetch}.v2.b$fromWidth "
+ "\t[$addr], {{$src1, $src2}}${policy};">;
def _v4
- : NVPTXInst<(outs),
- (ins RegOrSink:$src1, RegOrSink:$src2, RegOrSink:$src3,
- RegOrSink:$src4, AtomicCode:$sem, AtomicCode:$scope,
- AtomicCode:$addsp, i32imm:$fromWidth,
- CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
- "st${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
- "cacheHint:l2}${cacheHint:prefetch}.v4.b$fromWidth "
- "\t[$addr], {{$src1, $src2, $src3, $src4}}${policy};">;
+ : NVPTXInst<
+ (outs),
+ (ins RegOrSink:$src1, RegOrSink:$src2, RegOrSink:$src3,
+ RegOrSink:$src4, AtomicCode:$sem, AtomicCode:$scope,
+ AtomicCode:$addsp, i32imm:$fromWidth,
+ CacheControlHint:$cacheControlHint, ADDR:$addr,
+ CachePolicy:$policy),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${cacheControlHint:l1}${"
+ "cacheControlHint:l2}${cacheControlHint:prefetch}.v4.b$fromWidth "
+ "\t[$addr], {{$src1, $src2, $src3, $src4}}${policy};">;
if support_v8 then
def _v8
- : NVPTXInst<(outs),
- (ins RegOrSink:$src1, RegOrSink:$src2, RegOrSink:$src3,
- RegOrSink:$src4, RegOrSink:$src5, RegOrSink:$src6,
- RegOrSink:$src7, RegOrSink:$src8, AtomicCode:$sem,
- AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth,
- CacheHint:$cacheHint, ADDR:$addr, CachePolicy:$policy),
- "st${sem:sem}${scope:scope}${addsp:addsp}${cacheHint:l1}${"
- "cacheHint:l2}${cacheHint:prefetch}.v8.b$fromWidth "
- "\t[$addr], "
- "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, "
- "$src8}}${policy};">;
+ : NVPTXInst<
+ (outs),
+ (ins RegOrSink:$src1, RegOrSink:$src2, RegOrSink:$src3,
+ RegOrSink:$src4, RegOrSink:$src5, RegOrSink:$src6,
+ RegOrSink:$src7, RegOrSink:$src8, AtomicCode:$sem,
+ AtomicCode:$scope, AtomicCode:$addsp, i32imm:$fromWidth,
+ CacheControlHint:$cacheControlHint, ADDR:$addr,
+ CachePolicy:$policy),
+ "st${sem:sem}${scope:scope}${addsp:addsp}${cacheControlHint:l1}${"
+ "cacheControlHint:l2}${cacheControlHint:prefetch}.v8.b$fromWidth "
+ "\t[$addr], "
+ "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, "
+ "$src8}}${policy};">;
}
let mayStore=1, hasSideEffects=0 in {
diff --git a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
index 2bd83e83c46de..9378885d6fdbf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -24,8 +24,8 @@ namespace llvm {
// Stored per-MMO to avoid pointer collisions when multiple memops share
// the same pointer value but have different cache policies.
struct NVPTXMMOCachePolicyData {
- uint64_t Policy; // The 64-bit cache policy value for L2::cache_hint
- unsigned CacheHint; // Other cache hints (L1 eviction, L2 eviction, prefetch)
+ uint64_t Policy; // The 64-bit cache policy value for L2::cache_hint
+ unsigned CacheControlHint; // Cache control hints (L1/L2 eviction, prefetch)
};
class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
@@ -70,8 +70,8 @@ class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
/// Store cache policy data for a MachineMemOperand.
void setCachePolicyData(MachineMemOperand *MMO, uint64_t Policy,
- unsigned CacheHint) {
- CachePolicyMap[MMO] = {Policy, CacheHint};
+ unsigned CacheControlHint) {
+ CachePolicyMap[MMO] = {Policy, CacheControlHint};
}
/// Get cache policy data for a MachineMemOperand.
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 3ff0e7ecf8411..1e4fc091e7bfa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -419,7 +419,8 @@ static std::optional<L2Prefetch> parseL2Prefetch(StringRef Str) {
// Each node contains key-value pairs (operand_no can be anywhere):
// !node = !{!"operand_no", i32 N, !"nvvm.key1", value1, ...}
// Returns the matching MDNode or nullptr if not found.
-static const MDNode *findCacheHintNode(const MDNode *MD, unsigned OperandNo) {
+static const MDNode *findCacheControlHintNode(const MDNode *MD,
+ unsigned OperandNo) {
if (!MD)
return nullptr;
@@ -459,12 +460,13 @@ static const MDNode *findCacheHintNode(const MDNode *MD, unsigned OperandNo) {
return nullptr;
}
-unsigned getCacheHintFromMetadata(const Instruction *I, unsigned OperandNo) {
+unsigned getCacheControlHintFromMetadata(const Instruction *I,
+ unsigned OperandNo) {
if (!I)
return 0;
MDNode *MD = I->getMetadata("mem.cache_hint");
- const MDNode *Node = findCacheHintNode(MD, OperandNo);
+ const MDNode *Node = findCacheControlHintNode(MD, OperandNo);
if (!Node)
return 0;
@@ -520,7 +522,7 @@ unsigned getCacheHintFromMetadata(const Instruction *I, unsigned OperandNo) {
// Unknown keys are silently ignored (may be target-specific extensions)
}
- return encodeCacheHint(L1, L2, Prefetch);
+ return encodeCacheControlHint(L1, L2, Prefetch);
}
std::optional<uint64_t> getCachePolicyFromMetadata(const Instruction *I,
@@ -529,7 +531,7 @@ std::optional<uint64_t> getCachePolicyFromMetadata(const Instruction *I,
return std::nullopt;
MDNode *MD = I->getMetadata("mem.cache_hint");
- const MDNode *Node = findCacheHintNode(MD, OperandNo);
+ const MDNode *Node = findCacheControlHintNode(MD, OperandNo);
if (!Node)
return std::nullopt;
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index 39fc46ca1ec8b..c7ba5d2c5a919 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -206,10 +206,11 @@ inline raw_ostream &operator<<(raw_ostream &O, AddressSpace A) {
}
/// Parse !mem.cache_hint metadata from an instruction.
-/// Returns the encoded cache hint value, or 0 if no valid metadata is present.
-/// The OperandNo parameter specifies which pointer operand to look for
+/// Returns the encoded cache control hint value, or 0 if no valid metadata is
+/// present. The OperandNo parameter specifies which pointer operand to look for
/// (for instructions with multiple pointer operands like memcpy).
-unsigned getCacheHintFromMetadata(const Instruction *I, unsigned OperandNo = 0);
+unsigned getCacheControlHintFromMetadata(const Instruction *I,
+ unsigned OperandNo = 0);
/// Returns the L2::cache_hint value from !mem.cache_hint metadata, or
/// std::nullopt if no nvvm.l2_cache_hint is specified. The value is a 64-bit
diff --git a/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll b/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
index abb5f902fb39f..6a0acadcb10bd 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
@@ -321,6 +321,35 @@ define void @test_store_v2f64_l1_no_allocate(ptr addrspace(1) %p, <2 x double> %
ret void
}
+;-----------------------------------------------------------------------------
+; Invariant loads with cache hints - should NOT use LDG (ld.global.nc)
+;-----------------------------------------------------------------------------
+
+; CHECK-LABEL: test_invariant_load_with_hint
+; CHECK: ld.global.L1::evict_first.b32
+; CHECK-NOT: ld.global.nc
+define i32 @test_invariant_load_with_hint(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !invariant.load !{}, !mem.cache_hint !0
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_invariant_load_with_cache_policy
+; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 12345
+; CHECK: ld.global.L2::cache_hint.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}], [[POLICY]]
+; CHECK-NOT: ld.global.nc
+define i32 @test_invariant_load_with_cache_policy(ptr addrspace(1) %p) {
+ %v = load i32, ptr addrspace(1) %p, !invariant.load !{}, !mem.cache_hint !30
+ ret i32 %v
+}
+
+; CHECK-LABEL: test_invariant_load_v2i32_with_hint
+; CHECK: ld.global.L1::evict_last.L2::evict_first.v2.b32
+; CHECK-NOT: ld.global.nc
+define <2 x i32> @test_invariant_load_v2i32_with_hint(ptr addrspace(1) %p) {
+ %v = load <2 x i32>, ptr addrspace(1) %p, !invariant.load !{}, !mem.cache_hint !22
+ ret <2 x i32> %v
+}
+
;-----------------------------------------------------------------------------
; No hint should produce plain load/store
;-----------------------------------------------------------------------------
>From eb6c2f211c5f7ebf6c2c81adae04ef1d36c7c395 Mon Sep 17 00:00:00 2001
From: Fei Peng <feip at nvidia.com>
Date: Tue, 20 Jan 2026 14:38:54 -0800
Subject: [PATCH 07/10] [NVPTX] Add mem.cache_hint to FixedMetadataKinds and IR
Verifier
Address review feedback:
- Add MD_mem_cache_hint to FixedMetadataKinds.def for faster metadata
lookup instead of string-based lookup
- Add generic validation in IR Verifier for !mem.cache_hint metadata
structure (validates keys are strings, operand_no is integer, etc.)
- Simplify NVPTXUtilities.cpp parsing code by removing LLVM_DEBUG
statements since validation is now handled by the Verifier
---
llvm/include/llvm/IR/FixedMetadataKinds.def | 1 +
llvm/lib/IR/Verifier.cpp | 36 ++++++++++
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 69 ++++---------------
.../CodeGen/NVPTX/load-store-cache-hint.ll | 2 +-
4 files changed, 53 insertions(+), 55 deletions(-)
diff --git a/llvm/include/llvm/IR/FixedMetadataKinds.def b/llvm/include/llvm/IR/FixedMetadataKinds.def
index 98129985714b2..c98eb39794e22 100644
--- a/llvm/include/llvm/IR/FixedMetadataKinds.def
+++ b/llvm/include/llvm/IR/FixedMetadataKinds.def
@@ -59,3 +59,4 @@ LLVM_FIXED_MD_KIND(MD_captures, "captures", 44)
LLVM_FIXED_MD_KIND(MD_alloc_token, "alloc_token", 45)
LLVM_FIXED_MD_KIND(MD_implicit_ref, "implicit.ref", 46)
LLVM_FIXED_MD_KIND(MD_nofpclass, "nofpclass", 47)
+LLVM_FIXED_MD_KIND(MD_mem_cache_hint, "mem.cache_hint", 47)
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 3f0f6c127b456..d6657d46af1c6 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -546,6 +546,7 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
void visitAccessGroupMetadata(const MDNode *MD);
void visitCapturesMetadata(Instruction &I, const MDNode *Captures);
void visitAllocTokenMetadata(Instruction &I, MDNode *MD);
+ void visitMemCacheHintMetadata(Instruction &I, MDNode *MD);
template <class Ty> bool isValidMetadataArray(const MDTuple &N);
#define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS) void visit##CLASS(const CLASS &N);
@@ -5567,6 +5568,38 @@ void Verifier::visitAllocTokenMetadata(Instruction &I, MDNode *MD) {
"expected integer constant", MD);
}
+void Verifier::visitMemCacheHintMetadata(Instruction &I, MDNode *MD) {
+ Check(I.mayReadOrWriteMemory(),
+ "!mem.cache_hint is only valid on memory operations", &I);
+
+ // Top-level metadata is an array of operand-specific nodes
+ for (const MDOperand &Op : MD->operands()) {
+ Check(Op, "!mem.cache_hint operand must not be null", MD);
+ const auto *Node = dyn_cast<MDNode>(Op);
+ Check(Node, "!mem.cache_hint operand must be a metadata node", MD);
+
+ // Each node contains key-value pairs with even number of operands
+ Check(Node->getNumOperands() % 2 == 0,
+ "!mem.cache_hint node must have even number of operands (key-value "
+ "pairs)",
+ Node);
+
+ // Validate that keys are strings; values are target-specific
+ for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
+ const auto *Key = dyn_cast<MDString>(Node->getOperand(j));
+ Check(Key, "!mem.cache_hint key must be a string", Node);
+
+ // operand_no is a generic key that must be an integer
+ if (Key->getString() == "operand_no") {
+ auto *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(j + 1));
+ Check(CI, "!mem.cache_hint 'operand_no' must be an integer constant",
+ Node);
+ }
+ // Other keys are target-specific; their values are not validated here
+ }
+ }
+}
+
/// verifyInstruction - Verify that an instruction is well formed.
///
void Verifier::visitInstruction(Instruction &I) {
@@ -5799,6 +5832,9 @@ void Verifier::visitInstruction(Instruction &I) {
if (MDNode *MD = I.getMetadata(LLVMContext::MD_alloc_token))
visitAllocTokenMetadata(I, MD);
+ if (MDNode *MD = I.getMetadata(LLVMContext::MD_mem_cache_hint))
+ visitMemCacheHintMetadata(I, MD);
+
if (MDNode *N = I.getDebugLoc().getAsMDNode()) {
CheckDI(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N);
visitMDNode(*N, AreDebugLocsAllowed::Yes);
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 1e4fc091e7bfa..142a992b7840d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -24,7 +24,6 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
-#include "llvm/Support/Debug.h"
#include "llvm/Support/ModRef.h"
#include "llvm/Support/Mutex.h"
#include <cstdint>
@@ -35,8 +34,6 @@
#include <string>
#include <vector>
-#define DEBUG_TYPE "nvptx-utilities"
-
namespace llvm {
namespace {
@@ -426,35 +423,21 @@ static const MDNode *findCacheControlHintNode(const MDNode *MD,
for (const MDOperand &Op : MD->operands()) {
const auto *Node = dyn_cast<MDNode>(Op);
- if (!Node || Node->getNumOperands() < 2) {
- LLVM_DEBUG(if (Node) dbgs()
- << "NVPTX: Skipping malformed cache hint node with "
- << Node->getNumOperands() << " operands\n");
+ if (!Node)
continue;
- }
// Search for operand_no in the node (can be at any position)
- // Key-value pairs require index iteration with stride 2
- std::optional<unsigned> NodeOperandNo;
for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
const auto *Key = dyn_cast<MDString>(Node->getOperand(j));
if (Key && Key->getString() == "operand_no") {
if (auto *OpNoCI =
- mdconst::dyn_extract<ConstantInt>(Node->getOperand(j + 1)))
- NodeOperandNo = OpNoCI->getZExtValue();
- else
- LLVM_DEBUG(dbgs() << "NVPTX: operand_no value is not ConstantInt\n");
+ mdconst::dyn_extract<ConstantInt>(Node->getOperand(j + 1))) {
+ if (OpNoCI->getZExtValue() == OperandNo)
+ return Node;
+ }
break;
}
}
-
- if (!NodeOperandNo) {
- LLVM_DEBUG(dbgs() << "NVPTX: Cache hint node missing operand_no\n");
- continue;
- }
-
- if (*NodeOperandNo == OperandNo)
- return Node;
}
return nullptr;
@@ -465,7 +448,7 @@ unsigned getCacheControlHintFromMetadata(const Instruction *I,
if (!I)
return 0;
- MDNode *MD = I->getMetadata("mem.cache_hint");
+ MDNode *MD = I->getMetadata(LLVMContext::MD_mem_cache_hint);
const MDNode *Node = findCacheControlHintNode(MD, OperandNo);
if (!Node)
return 0;
@@ -474,50 +457,32 @@ unsigned getCacheControlHintFromMetadata(const Instruction *I,
L2Eviction L2 = L2Eviction::Normal;
L2Prefetch Prefetch = L2Prefetch::None;
- // Parse all key-value pairs from the matching node
- // Key-value pairs require index iteration with stride 2
+ // Parse all key-value pairs from the matching node.
+ // Metadata structure is validated by the IR Verifier.
for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
const auto *Key = dyn_cast<MDString>(Node->getOperand(j));
- if (!Key) {
- LLVM_DEBUG(dbgs() << "NVPTX: Cache hint key at index " << j
- << " is not a string\n");
+ if (!Key)
continue;
- }
StringRef KeyStr = Key->getString();
if (KeyStr == "operand_no")
- continue; // Already processed by findCacheHintNode
+ continue; // Already processed by findCacheControlHintNode
// For eviction and prefetch hints, value should be a string
const auto *Val = dyn_cast<MDString>(Node->getOperand(j + 1));
- if (!Val) {
- // nvvm.l2_cache_hint uses i64, not string - skip here
- if (KeyStr != "nvvm.l2_cache_hint") {
- LLVM_DEBUG(dbgs() << "NVPTX: Value for '" << KeyStr
- << "' is not a string\n");
- }
- continue;
- }
+ if (!Val)
+ continue; // nvvm.l2_cache_hint uses i64, handled separately
StringRef ValStr = Val->getString();
if (KeyStr == "nvvm.l1_eviction") {
if (auto Parsed = parseL1Eviction(ValStr))
L1 = *Parsed;
- else
- LLVM_DEBUG(dbgs() << "NVPTX: Unknown L1 eviction policy: " << ValStr
- << "\n");
} else if (KeyStr == "nvvm.l2_eviction") {
if (auto Parsed = parseL2Eviction(ValStr))
L2 = *Parsed;
- else
- LLVM_DEBUG(dbgs() << "NVPTX: Unknown L2 eviction policy: " << ValStr
- << "\n");
} else if (KeyStr == "nvvm.l2_prefetch_size") {
if (auto Parsed = parseL2Prefetch(ValStr))
Prefetch = *Parsed;
- else
- LLVM_DEBUG(dbgs() << "NVPTX: Unknown L2 prefetch size: " << ValStr
- << "\n");
}
// Unknown keys are silently ignored (may be target-specific extensions)
}
@@ -530,25 +495,21 @@ std::optional<uint64_t> getCachePolicyFromMetadata(const Instruction *I,
if (!I)
return std::nullopt;
- MDNode *MD = I->getMetadata("mem.cache_hint");
+ MDNode *MD = I->getMetadata(LLVMContext::MD_mem_cache_hint);
const MDNode *Node = findCacheControlHintNode(MD, OperandNo);
if (!Node)
return std::nullopt;
- // Look for nvvm.l2_cache_hint in the matching node
- // Key-value pairs require index iteration with stride 2
+ // Look for nvvm.l2_cache_hint in the matching node.
+ // Metadata structure is validated by the IR Verifier.
for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
const auto *Key = dyn_cast<MDString>(Node->getOperand(j));
if (!Key || Key->getString() != "nvvm.l2_cache_hint")
continue;
- // The value should be an i64 constant
if (auto *ValCI =
mdconst::dyn_extract<ConstantInt>(Node->getOperand(j + 1)))
return ValCI->getZExtValue();
-
- LLVM_DEBUG(
- dbgs() << "NVPTX: nvvm.l2_cache_hint value is not ConstantInt\n");
}
return std::nullopt;
diff --git a/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll b/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
index 6a0acadcb10bd..3b328ea408978 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
@@ -656,7 +656,7 @@ define i32 @test_load_cache_hint_null_value(ptr addrspace(1) %p) {
ret i32 %v
}
-; nvvm.l2_cache_hint with wrong type (i32 instead of i64) - should still work
+; nvvm.l2_cache_hint with i32 instead of i64 - should still work
; as mdconst::dyn_extract<ConstantInt> accepts any integer type
; CHECK-LABEL: test_load_cache_hint_i32_value
; CHECK: mov.b64 [[POLICY:%rd[0-9]+]], 99999
>From abdd72590f7666afd1a9529f76bbc63179b6d1ef Mon Sep 17 00:00:00 2001
From: Fei Peng <feip at nvidia.com>
Date: Tue, 20 Jan 2026 15:03:54 -0800
Subject: [PATCH 08/10] [NVPTX] Use Bitfield API consistently, remove legacy
mask constants
Replace manual bit manipulation with Bitfield::set<>/get<> throughout
the cache hint code and remove the now-unused mask/shift constants
(L1EvictionMask, L2EvictionMask, L2PrefetchMask, L2CacheHintFlag, etc.)
from NVPTX.h.
---
llvm/lib/Target/NVPTX/NVPTX.h | 9 ---------
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 17 ++++++++++-------
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 2 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 4 ++--
4 files changed, 13 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 5049e5ad71a1e..e2608706270ab 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -243,15 +243,6 @@ using L2PrefetchBits =
Bitfield::Element<L2Prefetch, 5, 2, L2Prefetch::Bytes256>;
using L2CacheHintBit = Bitfield::Element<bool, 7, 1>;
-// Masks for clearing/testing fields (for legacy code and instruction emission)
-constexpr unsigned L1EvictionShift = 0;
-constexpr unsigned L1EvictionMask = 0x7;
-constexpr unsigned L2EvictionShift = 3;
-constexpr unsigned L2EvictionMask = 0x3;
-constexpr unsigned L2PrefetchShift = 5;
-constexpr unsigned L2PrefetchMask = 0x3;
-constexpr unsigned L2CacheHintFlag = 0x80; // Bit 7: L2::cache_hint mode
-
inline unsigned encodeCacheControlHint(L1Eviction L1, L2Eviction L2,
L2Prefetch P) {
unsigned Hint = 0;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 52264eaa1cfce..c2ab534ff3c8a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1139,8 +1139,8 @@ getCachePolicy(const MemSDNode *N, const NVPTXMachineFunctionInfo *MFI) {
if (!Data)
return std::nullopt;
- // Only return policy if L2CacheHintFlag is set (indicating policy mode)
- if (!(Data->CacheControlHint & NVPTX::L2CacheHintFlag))
+ // Only return policy if L2CacheHintBit is set (indicating policy mode)
+ if (!NVPTX::isL2CacheHintMode(Data->CacheControlHint))
return std::nullopt;
return Data->Policy;
@@ -1162,12 +1162,14 @@ std::pair<unsigned, SDValue> NVPTXDAGToDAGISel::getCacheControlHintAndPolicyReg(
// Check L1 eviction hint (SM 70+)
if (!Subtarget->hasL1EvictionHint()) {
- CacheControlHint &= ~(NVPTX::L1EvictionMask << NVPTX::L1EvictionShift);
+ Bitfield::set<NVPTX::L1EvictionBits>(CacheControlHint,
+ NVPTX::L1Eviction::Normal);
}
// Check L2 eviction hint (SM 70+)
if (!Subtarget->hasL2EvictionHint()) {
- CacheControlHint &= ~(NVPTX::L2EvictionMask << NVPTX::L2EvictionShift);
+ Bitfield::set<NVPTX::L2EvictionBits>(CacheControlHint,
+ NVPTX::L2Eviction::Normal);
}
// Check L2 prefetch hints (SM 75+ for 64B/128B, SM 80+ for 256B)
@@ -1189,14 +1191,15 @@ std::pair<unsigned, SDValue> NVPTXDAGToDAGISel::getCacheControlHintAndPolicyReg(
}
if (!PrefetchSupported) {
// Clear the prefetch bits if not supported
- CacheControlHint &= ~(NVPTX::L2PrefetchMask << NVPTX::L2PrefetchShift);
+ Bitfield::set<NVPTX::L2PrefetchBits>(CacheControlHint,
+ NVPTX::L2Prefetch::None);
}
}
// L2::cache_hint is only supported for global address space.
// Clear the flag for non-global address spaces.
if (CodeAddrSpace != NVPTX::AddressSpace::Global) {
- CacheControlHint &= ~NVPTX::L2CacheHintFlag;
+ Bitfield::set<NVPTX::L2CacheHintBit>(CacheControlHint, false);
} else if (Subtarget->hasL2CacheHint()) {
// Check for L2::cache_hint with cache policy (requires SM 80+ and PTX 7.4+)
if (auto CachePolicyVal = getCachePolicy(N, MFI)) {
@@ -1211,7 +1214,7 @@ std::pair<unsigned, SDValue> NVPTXDAGToDAGISel::getCacheControlHintAndPolicyReg(
// If no policy or L2::cache_hint not supported, use NOREG and clear flag
if (!PolicyReg) {
PolicyReg = CurDAG->getRegister(NVPTX::NoRegister, MVT::i64);
- CacheControlHint &= ~NVPTX::L2CacheHintFlag;
+ Bitfield::set<NVPTX::L2CacheHintBit>(CacheControlHint, false);
}
return {CacheControlHint, PolicyReg};
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index b2e7563ca3042..5ae1f98e9aa37 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -107,7 +107,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
// Returns the cache control hint and policy register for a memory operation.
// If L2::cache_hint mode is active (SM 80+, PTX 7.4+, global address space),
- // returns the updated cache control hint with L2CacheHintFlag set and a
+ // returns the updated cache control hint with L2CacheHintBit set and a
// register containing the 64-bit policy value. Otherwise returns the original
// hint and NOREG.
std::pair<unsigned, SDValue>
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 16a46f8432e88..2f723c332a8f2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -7374,8 +7374,8 @@ void NVPTXTargetLowering::recordTargetMMOInfo(MachineFunction &MF,
uint64_t CachePolicy = 0;
if (auto Policy = NVPTX::getCachePolicyFromMetadata(&I, OperandNo)) {
CachePolicy = *Policy;
- // Set the L2CacheHintFlag to indicate policy mode
- CacheControlHint |= NVPTX::L2CacheHintFlag;
+ // Set the L2CacheHintBit to indicate policy mode
+ Bitfield::set<NVPTX::L2CacheHintBit>(CacheControlHint, true);
}
// If no cache hints, nothing to store
>From e1c7962bfe16e9e772a0dba0fda33312969b66f7 Mon Sep 17 00:00:00 2001
From: Fei Peng <feip at nvidia.com>
Date: Fri, 30 Jan 2026 00:17:06 -0800
Subject: [PATCH 09/10] Address review comments: fix isValid() and add verifier
tests
---
llvm/include/llvm/IR/FixedMetadataKinds.def | 2 +-
.../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 2 +-
llvm/test/Verifier/mem-cache-hint.ll | 37 +++++++++++++++++++
3 files changed, 39 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/Verifier/mem-cache-hint.ll
diff --git a/llvm/include/llvm/IR/FixedMetadataKinds.def b/llvm/include/llvm/IR/FixedMetadataKinds.def
index c98eb39794e22..fa71e29366d0c 100644
--- a/llvm/include/llvm/IR/FixedMetadataKinds.def
+++ b/llvm/include/llvm/IR/FixedMetadataKinds.def
@@ -59,4 +59,4 @@ LLVM_FIXED_MD_KIND(MD_captures, "captures", 44)
LLVM_FIXED_MD_KIND(MD_alloc_token, "alloc_token", 45)
LLVM_FIXED_MD_KIND(MD_implicit_ref, "implicit.ref", 46)
LLVM_FIXED_MD_KIND(MD_nofpclass, "nofpclass", 47)
-LLVM_FIXED_MD_KIND(MD_mem_cache_hint, "mem.cache_hint", 47)
+LLVM_FIXED_MD_KIND(MD_mem_cache_hint, "mem.cache_hint", 48)
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 025a1ddaa6e0f..e8ef2c97ec2fa 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -436,7 +436,7 @@ void NVPTXInstPrinter::printCachePolicy(const MCInst *MI, int OpNum,
raw_ostream &O) {
const MCOperand &MO = MI->getOperand(OpNum);
// If the operand is a register and valid, print ", $reg"
- if (MO.isReg() && MO.getReg() != 0) {
+ if (MO.isReg() && MO.getReg().isValid()) {
O << ", ";
printRegName(O, MO.getReg());
}
diff --git a/llvm/test/Verifier/mem-cache-hint.ll b/llvm/test/Verifier/mem-cache-hint.ll
new file mode 100644
index 0000000000000..81b27a12fb2bd
--- /dev/null
+++ b/llvm/test/Verifier/mem-cache-hint.ll
@@ -0,0 +1,37 @@
+; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s
+
+; CHECK: !mem.cache_hint is only valid on memory operations
+define void @non_memory_op(i32 %x, i32 %y) {
+ %z = add i32 %x, %y, !mem.cache_hint !{!{!"operand_no", i32 0}}
+ ret void
+}
+
+; CHECK: !mem.cache_hint operand must not be null
+define void @null_operand(ptr %p) {
+ %v = load i32, ptr %p, !mem.cache_hint !{null}
+ ret void
+}
+
+; CHECK: !mem.cache_hint operand must be a metadata node
+define void @operand_not_mdnode(ptr %p) {
+ %v = load i32, ptr %p, !mem.cache_hint !{!"not_a_node"}
+ ret void
+}
+
+; CHECK: !mem.cache_hint node must have even number of operands
+define void @odd_operands(ptr %p) {
+ %v = load i32, ptr %p, !mem.cache_hint !{!{!"operand_no", i32 0, !"extra"}}
+ ret void
+}
+
+; CHECK: !mem.cache_hint key must be a string
+define void @key_not_string(ptr %p) {
+ %v = load i32, ptr %p, !mem.cache_hint !{!{i32 0, i32 1}}
+ ret void
+}
+
+; CHECK: !mem.cache_hint 'operand_no' must be an integer constant
+define void @operand_no_not_integer(ptr %p) {
+ %v = load i32, ptr %p, !mem.cache_hint !{!{!"operand_no", !"zero"}}
+ ret void
+}
>From 8ca032572d844cd3630cdab8b50f915a65e92b3c Mon Sep 17 00:00:00 2001
From: Fei Peng <feip at nvidia.com>
Date: Sun, 8 Feb 2026 23:15:01 -0800
Subject: [PATCH 10/10] [NVPTX] Update mem.cache_hint format handling and tests
---
llvm/lib/IR/Verifier.cpp | 66 ++++--
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 42 ++--
.../CodeGen/NVPTX/cache-hint-sm-version.ll | 40 ++--
.../CodeGen/NVPTX/load-store-cache-hint.ll | 208 +++++++++---------
llvm/test/CodeGen/NVPTX/memcpy-cache-hint.ll | 114 +++++-----
llvm/test/DebugInfo/NVPTX/inlinedAt_2.mir | 4 +-
llvm/test/Verifier/mem-cache-hint.ll | 59 +++--
7 files changed, 295 insertions(+), 238 deletions(-)
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index d6657d46af1c6..ac2aa7cd4b24d 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5572,30 +5572,64 @@ void Verifier::visitMemCacheHintMetadata(Instruction &I, MDNode *MD) {
Check(I.mayReadOrWriteMemory(),
"!mem.cache_hint is only valid on memory operations", &I);
- // Top-level metadata is an array of operand-specific nodes
- for (const MDOperand &Op : MD->operands()) {
- Check(Op, "!mem.cache_hint operand must not be null", MD);
- const auto *Node = dyn_cast<MDNode>(Op);
- Check(Node, "!mem.cache_hint operand must be a metadata node", MD);
+ Check(MD->getNumOperands() % 2 == 0,
+ "!mem.cache_hint must have even number of operands "
+ "(operand_no, hint_node pairs)",
+ MD);
+
+ auto IsMemoryObjectOperand = [](const Value *V) {
+ return V->getType()->isPtrOrPtrVectorTy();
+ };
+
+ unsigned NumMemoryObjectOperands = 0;
+ if (const auto *CB = dyn_cast<CallBase>(&I))
+ NumMemoryObjectOperands = count_if(CB->args(), [&](const Use &Arg) {
+ return IsMemoryObjectOperand(Arg.get());
+ });
+ else
+ NumMemoryObjectOperands = count_if(I.operands(), [&](const Use &Op) {
+ return IsMemoryObjectOperand(Op.get());
+ });
+
+ SmallVector<unsigned, 4> SeenOperandNos;
+
+ // Top-level metadata alternates: i32 operand_no, MDNode hint_node.
+ for (unsigned i = 0; i + 1 < MD->getNumOperands(); i += 2) {
+ auto *OpNoCI = mdconst::dyn_extract<ConstantInt>(MD->getOperand(i));
+ Check(OpNoCI,
+ "!mem.cache_hint operand_no must be an integer constant in pair", MD);
+
+ Check(OpNoCI->getValue().isNonNegative(),
+ "!mem.cache_hint operand_no must be non-negative", MD);
+
+ uint64_t OperandNo = OpNoCI->getZExtValue();
+ Check(OperandNo < NumMemoryObjectOperands,
+ "!mem.cache_hint operand_no must refer to a valid memory object "
+ "operand",
+ &I);
+
+ Check(!is_contained(SeenOperandNos, OperandNo),
+ "!mem.cache_hint contains duplicate operand_no", MD);
+ SeenOperandNos.push_back(OperandNo);
+
+ const auto *Node = dyn_cast<MDNode>(MD->getOperand(i + 1));
+ Check(Node, "!mem.cache_hint hint node must be a metadata node", MD);
- // Each node contains key-value pairs with even number of operands
Check(Node->getNumOperands() % 2 == 0,
- "!mem.cache_hint node must have even number of operands (key-value "
- "pairs)",
+ "!mem.cache_hint hint node must have even number of operands "
+ "(key-value pairs)",
Node);
- // Validate that keys are strings; values are target-specific
+ SmallVector<StringRef, 8> SeenKeys;
for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
const auto *Key = dyn_cast<MDString>(Node->getOperand(j));
Check(Key, "!mem.cache_hint key must be a string", Node);
- // operand_no is a generic key that must be an integer
- if (Key->getString() == "operand_no") {
- auto *CI = mdconst::dyn_extract<ConstantInt>(Node->getOperand(j + 1));
- Check(CI, "!mem.cache_hint 'operand_no' must be an integer constant",
- Node);
- }
- // Other keys are target-specific; their values are not validated here
+ StringRef KeyStr = Key->getString();
+ Check(!is_contained(SeenKeys, KeyStr),
+ "!mem.cache_hint hint node contains duplicate key", Node);
+ SeenKeys.push_back(KeyStr);
+ // Values are target-specific and not validated here.
}
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 142a992b7840d..7f8915b83fcc8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -410,34 +410,28 @@ static std::optional<L2Prefetch> parseL2Prefetch(StringRef Str) {
.Default(std::nullopt);
}
-// Helper to find the metadata node matching a specific operand number.
+// Helper to find the hint node matching a specific operand number.
// The metadata structure is:
-// !mem.cache_hint = !{!node1, !node2, ...}
-// Each node contains key-value pairs (operand_no can be anywhere):
-// !node = !{!"operand_no", i32 N, !"nvvm.key1", value1, ...}
-// Returns the matching MDNode or nullptr if not found.
+// !mem.cache_hint = !{ i32 opno0, !hints0, i32 opno1, !hints1, ... }
+// !hintsN = !{ !"nvvm.key1", value1, ... }
+// Returns the matching hints MDNode or nullptr if not found.
static const MDNode *findCacheControlHintNode(const MDNode *MD,
unsigned OperandNo) {
if (!MD)
return nullptr;
- for (const MDOperand &Op : MD->operands()) {
- const auto *Node = dyn_cast<MDNode>(Op);
- if (!Node)
+ unsigned NumOps = MD->getNumOperands();
+ if (NumOps % 2 != 0)
+ return nullptr;
+
+ for (unsigned i = 0; i + 1 < NumOps; i += 2) {
+ const auto *OpNoCI = mdconst::dyn_extract<ConstantInt>(MD->getOperand(i));
+ const auto *Node = dyn_cast<MDNode>(MD->getOperand(i + 1));
+ if (!OpNoCI || !Node || OpNoCI->getValue().isNegative())
continue;
- // Search for operand_no in the node (can be at any position)
- for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
- const auto *Key = dyn_cast<MDString>(Node->getOperand(j));
- if (Key && Key->getString() == "operand_no") {
- if (auto *OpNoCI =
- mdconst::dyn_extract<ConstantInt>(Node->getOperand(j + 1))) {
- if (OpNoCI->getZExtValue() == OperandNo)
- return Node;
- }
- break;
- }
- }
+ if (OpNoCI->getZExtValue() == OperandNo)
+ return Node;
}
return nullptr;
@@ -448,7 +442,7 @@ unsigned getCacheControlHintFromMetadata(const Instruction *I,
if (!I)
return 0;
- MDNode *MD = I->getMetadata(LLVMContext::MD_mem_cache_hint);
+ const MDNode *MD = I->getMetadata(LLVMContext::MD_mem_cache_hint);
const MDNode *Node = findCacheControlHintNode(MD, OperandNo);
if (!Node)
return 0;
@@ -458,15 +452,13 @@ unsigned getCacheControlHintFromMetadata(const Instruction *I,
L2Prefetch Prefetch = L2Prefetch::None;
// Parse all key-value pairs from the matching node.
- // Metadata structure is validated by the IR Verifier.
+ // Metadata structure is validated by the IR verifier.
for (unsigned j = 0; j + 1 < Node->getNumOperands(); j += 2) {
const auto *Key = dyn_cast<MDString>(Node->getOperand(j));
if (!Key)
continue;
StringRef KeyStr = Key->getString();
- if (KeyStr == "operand_no")
- continue; // Already processed by findCacheControlHintNode
// For eviction and prefetch hints, value should be a string
const auto *Val = dyn_cast<MDString>(Node->getOperand(j + 1));
@@ -495,7 +487,7 @@ std::optional<uint64_t> getCachePolicyFromMetadata(const Instruction *I,
if (!I)
return std::nullopt;
- MDNode *MD = I->getMetadata(LLVMContext::MD_mem_cache_hint);
+ const MDNode *MD = I->getMetadata(LLVMContext::MD_mem_cache_hint);
const MDNode *Node = findCacheControlHintNode(MD, OperandNo);
if (!Node)
return std::nullopt;
diff --git a/llvm/test/CodeGen/NVPTX/cache-hint-sm-version.ll b/llvm/test/CodeGen/NVPTX/cache-hint-sm-version.ll
index 8f1b03b5fde9e..8d0cae7af9590 100644
--- a/llvm/test/CodeGen/NVPTX/cache-hint-sm-version.ll
+++ b/llvm/test/CodeGen/NVPTX/cache-hint-sm-version.ll
@@ -306,41 +306,41 @@ define void @test_store_l1_no_allocate(ptr addrspace(1) %p, i32 %v) {
;-----------------------------------------------------------------------------
; L1 eviction: first
-!0 = !{!100}
-!100 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first"}
+!0 = !{i32 0, !100}
+!100 = !{!"nvvm.l1_eviction", !"first"}
; L2 eviction: last
-!1 = !{!101}
-!101 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"last"}
+!1 = !{i32 0, !101}
+!101 = !{!"nvvm.l2_eviction", !"last"}
; L2 prefetch: 128B
-!2 = !{!102}
-!102 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"128B"}
+!2 = !{i32 0, !102}
+!102 = !{!"nvvm.l2_prefetch_size", !"128B"}
; L2::cache_hint only
-!3 = !{!103}
-!103 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 12345}
+!3 = !{i32 0, !103}
+!103 = !{!"nvvm.l2_cache_hint", i64 12345}
; L2::cache_hint + L1 eviction
-!4 = !{!104}
-!104 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 44445, !"nvvm.l1_eviction", !"first"}
+!4 = !{i32 0, !104}
+!104 = !{!"nvvm.l2_cache_hint", i64 44445, !"nvvm.l1_eviction", !"first"}
; L2::cache_hint for store
-!5 = !{!105}
-!105 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 67890}
+!5 = !{i32 0, !105}
+!105 = !{!"nvvm.l2_cache_hint", i64 67890}
; L2 prefetch: 64B
-!6 = !{!106}
-!106 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"64B"}
+!6 = !{i32 0, !106}
+!106 = !{!"nvvm.l2_prefetch_size", !"64B"}
; L2 prefetch: 256B
-!7 = !{!107}
-!107 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"256B"}
+!7 = !{i32 0, !107}
+!107 = !{!"nvvm.l2_prefetch_size", !"256B"}
; L2 prefetch: 128B + L1 eviction
-!8 = !{!108}
-!108 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"128B", !"nvvm.l1_eviction", !"first"}
+!8 = !{i32 0, !108}
+!108 = !{!"nvvm.l2_prefetch_size", !"128B", !"nvvm.l1_eviction", !"first"}
; L1 eviction: no_allocate (for store)
-!9 = !{!109}
-!109 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"no_allocate"}
+!9 = !{i32 0, !109}
+!109 = !{!"nvvm.l1_eviction", !"no_allocate"}
diff --git a/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll b/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
index 3b328ea408978..ed98678f762aa 100644
--- a/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-cache-hint.ll
@@ -607,11 +607,11 @@ define i32 @test_multiple_loads_same_ptr(ptr addrspace(1) %p) {
; Invalid/edge cases
;-----------------------------------------------------------------------------
-; Test with invalid operand_no - should be ignored
-; CHECK-LABEL: test_load_invalid_operand_no
+; Test with empty hint node - should produce plain load
+; CHECK-LABEL: test_load_empty_hint_node
; CHECK: ld.global.b32
; CHECK-NOT: L1::
-define i32 @test_load_invalid_operand_no(ptr addrspace(1) %p) {
+define i32 @test_load_empty_hint_node(ptr addrspace(1) %p) {
%v = load i32, ptr addrspace(1) %p, !mem.cache_hint !11
ret i32 %v
}
@@ -624,7 +624,7 @@ define i32 @test_load_unknown_key(ptr addrspace(1) %p) {
ret i32 %v
}
-; Test with reordered metadata (operand_no not first) - should still work
+; Test with custom hint key order - should still work
; CHECK-LABEL: test_load_reordered_metadata
; CHECK: ld.global.L1::evict_last.L2::evict_first.b32
define i32 @test_load_reordered_metadata(ptr addrspace(1) %p) {
@@ -697,166 +697,166 @@ define i32 @test_load_l2_normal(ptr addrspace(1) %p) {
;-----------------------------------------------------------------------------
; L1 eviction policies
-!0 = !{!100}
-!100 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first"}
+!0 = !{i32 0, !100}
+!100 = !{!"nvvm.l1_eviction", !"first"}
-!1 = !{!101}
-!101 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last"}
+!1 = !{i32 0, !101}
+!101 = !{!"nvvm.l1_eviction", !"last"}
-!2 = !{!102}
-!102 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"unchanged"}
+!2 = !{i32 0, !102}
+!102 = !{!"nvvm.l1_eviction", !"unchanged"}
-!3 = !{!103}
-!103 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"no_allocate"}
+!3 = !{i32 0, !103}
+!103 = !{!"nvvm.l1_eviction", !"no_allocate"}
; L2 eviction policies
-!4 = !{!104}
-!104 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"first"}
+!4 = !{i32 0, !104}
+!104 = !{!"nvvm.l2_eviction", !"first"}
-!5 = !{!105}
-!105 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"last"}
+!5 = !{i32 0, !105}
+!105 = !{!"nvvm.l2_eviction", !"last"}
; L2 prefetch sizes
-!6 = !{!106}
-!106 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"64B"}
+!6 = !{i32 0, !106}
+!106 = !{!"nvvm.l2_prefetch_size", !"64B"}
-!7 = !{!107}
-!107 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"128B"}
+!7 = !{i32 0, !107}
+!107 = !{!"nvvm.l2_prefetch_size", !"128B"}
-!8 = !{!108}
-!108 = !{!"operand_no", i32 0, !"nvvm.l2_prefetch_size", !"256B"}
+!8 = !{i32 0, !108}
+!108 = !{!"nvvm.l2_prefetch_size", !"256B"}
-; Invalid operand_no (should be ignored for load which has operand 0)
-!11 = !{!111}
-!111 = !{!"operand_no", i32 5, !"nvvm.l1_eviction", !"first"}
+; Empty hint node (should not emit any qualifier)
+!11 = !{i32 0, !111}
+!111 = !{}
; Unknown key (should be ignored, but valid L1 hint should still work)
-!12 = !{!112}
-!112 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first", !"nvvm.unknown_key", !"value"}
+!12 = !{i32 0, !112}
+!112 = !{!"nvvm.l1_eviction", !"first", !"nvvm.unknown_key", !"value"}
; "normal" eviction (default, should not emit qualifier)
-!13 = !{!113}
-!113 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"normal"}
+!13 = !{i32 0, !113}
+!113 = !{!"nvvm.l1_eviction", !"normal"}
-!14 = !{!114}
-!114 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"normal"}
+!14 = !{i32 0, !114}
+!114 = !{!"nvvm.l2_eviction", !"normal"}
; All L1 + L2 combinations
-!20 = !{!120}
-!120 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first"}
+!20 = !{i32 0, !120}
+!120 = !{!"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first"}
-!21 = !{!121}
-!121 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last"}
+!21 = !{i32 0, !121}
+!121 = !{!"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last"}
-!22 = !{!122}
-!122 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first"}
+!22 = !{i32 0, !122}
+!122 = !{!"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first"}
-!23 = !{!123}
-!123 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"last"}
+!23 = !{i32 0, !123}
+!123 = !{!"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"last"}
; L1 + L2 + Prefetch combination
-!24 = !{!124}
-!124 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"128B"}
+!24 = !{i32 0, !124}
+!124 = !{!"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"128B"}
; L2::cache_hint with constant cache-policy
-!30 = !{!130}
-!130 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 12345}
+!30 = !{i32 0, !130}
+!130 = !{!"nvvm.l2_cache_hint", i64 12345}
-!31 = !{!131}
-!131 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 98765}
+!31 = !{i32 0, !131}
+!131 = !{!"nvvm.l2_cache_hint", i64 98765}
-!32 = !{!132}
-!132 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 55555}
+!32 = !{i32 0, !132}
+!132 = !{!"nvvm.l2_cache_hint", i64 55555}
-!33 = !{!133}
-!133 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 67890}
+!33 = !{i32 0, !133}
+!133 = !{!"nvvm.l2_cache_hint", i64 67890}
-!34 = !{!134}
-!134 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 11111}
+!34 = !{i32 0, !134}
+!134 = !{!"nvvm.l2_cache_hint", i64 11111}
-!35 = !{!135}
-!135 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22222}
+!35 = !{i32 0, !135}
+!135 = !{!"nvvm.l2_cache_hint", i64 22222}
; L2::cache_hint for vector types
-!40 = !{!140}
-!140 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 33333}
+!40 = !{i32 0, !140}
+!140 = !{!"nvvm.l2_cache_hint", i64 33333}
-!41 = !{!141}
-!141 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 44444}
+!41 = !{i32 0, !141}
+!141 = !{!"nvvm.l2_cache_hint", i64 44444}
-!42 = !{!142}
-!142 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 55556}
+!42 = !{i32 0, !142}
+!142 = !{!"nvvm.l2_cache_hint", i64 55556}
-!43 = !{!143}
-!143 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 66666}
+!43 = !{i32 0, !143}
+!143 = !{!"nvvm.l2_cache_hint", i64 66666}
-!44 = !{!144}
-!144 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 77777}
+!44 = !{i32 0, !144}
+!144 = !{!"nvvm.l2_cache_hint", i64 77777}
-!45 = !{!145}
-!145 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 88888}
+!45 = !{i32 0, !145}
+!145 = !{!"nvvm.l2_cache_hint", i64 88888}
-!46 = !{!146}
-!146 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 99999}
+!46 = !{i32 0, !146}
+!146 = !{!"nvvm.l2_cache_hint", i64 99999}
-!47 = !{!147}
-!147 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 11112}
+!47 = !{i32 0, !147}
+!147 = !{!"nvvm.l2_cache_hint", i64 11112}
-!48 = !{!148}
-!148 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22223}
+!48 = !{i32 0, !148}
+!148 = !{!"nvvm.l2_cache_hint", i64 22223}
-!49 = !{!149}
-!149 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 33334}
+!49 = !{i32 0, !149}
+!149 = !{!"nvvm.l2_cache_hint", i64 33334}
; L2::cache_hint combined with other hints (L2::cache_hint takes precedence)
-!50 = !{!150}
-!150 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 44445, !"nvvm.l1_eviction", !"first"}
+!50 = !{i32 0, !150}
+!150 = !{!"nvvm.l2_cache_hint", i64 44445, !"nvvm.l1_eviction", !"first"}
-!51 = !{!151}
-!151 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 55557, !"nvvm.l2_eviction", !"last"}
+!51 = !{i32 0, !151}
+!151 = !{!"nvvm.l2_cache_hint", i64 55557, !"nvvm.l2_eviction", !"last"}
-!52 = !{!152}
-!152 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 66667, !"nvvm.l2_prefetch_size", !"128B"}
+!52 = !{i32 0, !152}
+!152 = !{!"nvvm.l2_cache_hint", i64 66667, !"nvvm.l2_prefetch_size", !"128B"}
-!53 = !{!153}
-!153 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 77778, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"256B"}
+!53 = !{i32 0, !153}
+!153 = !{!"nvvm.l2_cache_hint", i64 77778, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"256B"}
-!54 = !{!154}
-!154 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 88889, !"nvvm.l1_eviction", !"unchanged"}
+!54 = !{i32 0, !154}
+!154 = !{!"nvvm.l2_cache_hint", i64 88889, !"nvvm.l1_eviction", !"unchanged"}
-!55 = !{!155}
-!155 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 99990, !"nvvm.l1_eviction", !"no_allocate", !"nvvm.l2_eviction", !"last"}
+!55 = !{i32 0, !155}
+!155 = !{!"nvvm.l2_cache_hint", i64 99990, !"nvvm.l1_eviction", !"no_allocate", !"nvvm.l2_eviction", !"last"}
-!56 = !{!156}
-!156 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 11113, !"nvvm.l1_eviction", !"first"}
+!56 = !{i32 0, !156}
+!156 = !{!"nvvm.l2_cache_hint", i64 11113, !"nvvm.l1_eviction", !"first"}
-!57 = !{!157}
-!157 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22224, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"64B"}
+!57 = !{i32 0, !157}
+!157 = !{!"nvvm.l2_cache_hint", i64 22224, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"64B"}
; Multiple loads same pointer test (different policies)
-!60 = !{!160}
-!160 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 11111, !"nvvm.l1_eviction", !"last"}
+!60 = !{i32 0, !160}
+!160 = !{!"nvvm.l2_cache_hint", i64 11111, !"nvvm.l1_eviction", !"last"}
-!61 = !{!161}
-!161 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22222, !"nvvm.l1_eviction", !"first"}
+!61 = !{i32 0, !161}
+!161 = !{!"nvvm.l2_cache_hint", i64 22222, !"nvvm.l1_eviction", !"first"}
-; Reordered metadata test (operand_no not first)
-!70 = !{!170}
-!170 = !{!"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first", !"operand_no", i32 0}
+; Custom key order in hint node
+!70 = !{i32 0, !170}
+!170 = !{!"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first"}
; Invalid nvvm.l2_cache_hint values - should be ignored, no L2::cache_hint emitted
; String value instead of i64 - invalid, L1 hint should still work
-!80 = !{!180}
-!180 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first", !"nvvm.l2_cache_hint", !"not_a_number"}
+!80 = !{i32 0, !180}
+!180 = !{!"nvvm.l1_eviction", !"first", !"nvvm.l2_cache_hint", !"not_a_number"}
; Null/metadata reference instead of constant - invalid
-!81 = !{!181}
-!181 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last", !"nvvm.l2_cache_hint", !{}}
+!81 = !{i32 0, !181}
+!181 = !{!"nvvm.l1_eviction", !"last", !"nvvm.l2_cache_hint", !{}}
; i32 instead of i64 - still valid, ConstantInt accepts any integer type
-!82 = !{!182}
-!182 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i32 99999}
+!82 = !{i32 0, !182}
+!182 = !{!"nvvm.l2_cache_hint", i32 99999}
; Store: string value for nvvm.l2_cache_hint - invalid
-!83 = !{!183}
-!183 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"unchanged", !"nvvm.l2_cache_hint", !"invalid"}
+!83 = !{i32 0, !183}
+!183 = !{!"nvvm.l1_eviction", !"unchanged", !"nvvm.l2_cache_hint", !"invalid"}
diff --git a/llvm/test/CodeGen/NVPTX/memcpy-cache-hint.ll b/llvm/test/CodeGen/NVPTX/memcpy-cache-hint.ll
index 64958bc8a1c88..62af9fa984f28 100644
--- a/llvm/test/CodeGen/NVPTX/memcpy-cache-hint.ll
+++ b/llvm/test/CodeGen/NVPTX/memcpy-cache-hint.ll
@@ -259,85 +259,85 @@ define void @test_memcpy_all_hints_both(ptr addrspace(1) %dest, ptr addrspace(1)
;-----------------------------------------------------------------------------
; memcpy with both dest and src hints
-!80 = !{!180, !181}
+!80 = !{i32 0, !180, i32 1, !181}
; operand 0 (dest/store): L1::evict_last, L2::evict_last
-!180 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"last"}
+!180 = !{!"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"last"}
; operand 1 (src/load): L1::evict_first, L2::evict_first, L2::128B prefetch
-!181 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"128B"}
+!181 = !{!"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"128B"}
; memcpy with only source hint (load side)
-!81 = !{!182}
-!182 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first"}
+!81 = !{i32 1, !182}
+!182 = !{!"nvvm.l1_eviction", !"first"}
; memcpy with only dest hint (store side)
-!82 = !{!183}
-!183 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"last"}
+!82 = !{i32 0, !183}
+!183 = !{!"nvvm.l2_eviction", !"last"}
; memcpy with L2::cache_hint on both operands
-!83 = !{!184, !185}
-!184 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 12121}
-!185 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 34343}
+!83 = !{i32 0, !184, i32 1, !185}
+!184 = !{!"nvvm.l2_cache_hint", i64 12121}
+!185 = !{!"nvvm.l2_cache_hint", i64 34343}
; Combined L1 + L2 eviction on source only
-!84 = !{!186}
-!186 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last"}
+!84 = !{i32 1, !186}
+!186 = !{!"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last"}
; Combined L1 + L2 eviction on dest only
-!85 = !{!187}
-!187 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"unchanged", !"nvvm.l2_eviction", !"first"}
+!85 = !{i32 0, !187}
+!187 = !{!"nvvm.l1_eviction", !"unchanged", !"nvvm.l2_eviction", !"first"}
; L1 + prefetch on source, L1 on dest
-!86 = !{!188, !189}
-!188 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"last", !"nvvm.l2_prefetch_size", !"256B"}
-!189 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"no_allocate"}
+!86 = !{i32 1, !188, i32 0, !189}
+!188 = !{!"nvvm.l1_eviction", !"last", !"nvvm.l2_prefetch_size", !"256B"}
+!189 = !{!"nvvm.l1_eviction", !"no_allocate"}
; Prefetch on source, L2 eviction on dest
-!87 = !{!190, !191}
-!190 = !{!"operand_no", i32 1, !"nvvm.l2_prefetch_size", !"64B"}
-!191 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"last"}
+!87 = !{i32 1, !190, i32 0, !191}
+!190 = !{!"nvvm.l2_prefetch_size", !"64B"}
+!191 = !{!"nvvm.l2_eviction", !"last"}
; L2::cache_hint + L1 eviction on source only
-!88 = !{!192}
-!192 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 55555, !"nvvm.l1_eviction", !"first"}
+!88 = !{i32 1, !192}
+!192 = !{!"nvvm.l2_cache_hint", i64 55555, !"nvvm.l1_eviction", !"first"}
; L2::cache_hint + L1 + L2 eviction on dest only
-!89 = !{!193}
-!193 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 66666, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first"}
+!89 = !{i32 0, !193}
+!193 = !{!"nvvm.l2_cache_hint", i64 66666, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first"}
; Both operands: L2::cache_hint + L1 + L2 eviction
-!90 = !{!194, !195}
-!194 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 77777, !"nvvm.l1_eviction", !"unchanged", !"nvvm.l2_eviction", !"last"}
-!195 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 88888, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first"}
+!90 = !{i32 1, !194, i32 0, !195}
+!194 = !{!"nvvm.l2_cache_hint", i64 77777, !"nvvm.l1_eviction", !"unchanged", !"nvvm.l2_eviction", !"last"}
+!195 = !{!"nvvm.l2_cache_hint", i64 88888, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first"}
; L2::cache_hint + prefetch on source, L2::cache_hint + L1 on dest
-!91 = !{!196, !197}
-!196 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 11111, !"nvvm.l2_prefetch_size", !"128B"}
-!197 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22222, !"nvvm.l1_eviction", !"last"}
+!91 = !{i32 1, !196, i32 0, !197}
+!196 = !{!"nvvm.l2_cache_hint", i64 11111, !"nvvm.l2_prefetch_size", !"128B"}
+!197 = !{!"nvvm.l2_cache_hint", i64 22222, !"nvvm.l1_eviction", !"last"}
; Complex source (all non-cache_hint), simple dest
-!92 = !{!198, !199}
-!198 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"64B"}
-!199 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last"}
+!92 = !{i32 1, !198, i32 0, !199}
+!198 = !{!"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"64B"}
+!199 = !{!"nvvm.l1_eviction", !"last"}
; Simple source, complex dest (with cache_hint)
-!93 = !{!200, !201}
-!200 = !{!"operand_no", i32 1, !"nvvm.l2_prefetch_size", !"256B"}
-!201 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 99999, !"nvvm.l1_eviction", !"no_allocate", !"nvvm.l2_eviction", !"last"}
+!93 = !{i32 1, !200, i32 0, !201}
+!200 = !{!"nvvm.l2_prefetch_size", !"256B"}
+!201 = !{!"nvvm.l2_cache_hint", i64 99999, !"nvvm.l1_eviction", !"no_allocate", !"nvvm.l2_eviction", !"last"}
; Different L1 policies: unchanged vs first
-!94 = !{!202, !203}
-!202 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"unchanged"}
-!203 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"first"}
+!94 = !{i32 1, !202, i32 0, !203}
+!202 = !{!"nvvm.l1_eviction", !"unchanged"}
+!203 = !{!"nvvm.l1_eviction", !"first"}
; Different L1 policies: no_allocate vs unchanged
-!95 = !{!204, !205}
-!204 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"no_allocate"}
-!205 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"unchanged"}
+!95 = !{i32 1, !204, i32 0, !205}
+!204 = !{!"nvvm.l1_eviction", !"no_allocate"}
+!205 = !{!"nvvm.l1_eviction", !"unchanged"}
; All hints maxed out on both operands
-!96 = !{!206, !207}
-!206 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 12345, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"256B"}
-!207 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 67890, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"128B"}
+!96 = !{i32 1, !206, i32 0, !207}
+!206 = !{!"nvvm.l2_cache_hint", i64 12345, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"first", !"nvvm.l2_prefetch_size", !"256B"}
+!207 = !{!"nvvm.l2_cache_hint", i64 67890, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"128B"}
;-----------------------------------------------------------------------------
; Large memcpy tests - verify hints propagate to all expanded load/stores
@@ -389,18 +389,18 @@ define void @test_memcpy_128bytes_combined(ptr addrspace(1) %dest, ptr addrspace
}
; Large memcpy metadata
-!97 = !{!208, !209}
-!208 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first"}
-!209 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last"}
+!97 = !{i32 1, !208, i32 0, !209}
+!208 = !{!"nvvm.l1_eviction", !"first"}
+!209 = !{!"nvvm.l1_eviction", !"last"}
-!98 = !{!210, !211}
-!210 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"unchanged"}
-!211 = !{!"operand_no", i32 0, !"nvvm.l2_eviction", !"first"}
+!98 = !{i32 1, !210, i32 0, !211}
+!210 = !{!"nvvm.l1_eviction", !"unchanged"}
+!211 = !{!"nvvm.l2_eviction", !"first"}
-!99 = !{!212, !213}
-!212 = !{!"operand_no", i32 1, !"nvvm.l2_cache_hint", i64 11111}
-!213 = !{!"operand_no", i32 0, !"nvvm.l2_cache_hint", i64 22222}
+!99 = !{i32 1, !212, i32 0, !213}
+!212 = !{!"nvvm.l2_cache_hint", i64 11111}
+!213 = !{!"nvvm.l2_cache_hint", i64 22222}
-!100 = !{!214, !215}
-!214 = !{!"operand_no", i32 1, !"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"256B"}
-!215 = !{!"operand_no", i32 0, !"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first"}
+!100 = !{i32 1, !214, i32 0, !215}
+!214 = !{!"nvvm.l1_eviction", !"first", !"nvvm.l2_eviction", !"last", !"nvvm.l2_prefetch_size", !"256B"}
+!215 = !{!"nvvm.l1_eviction", !"last", !"nvvm.l2_eviction", !"first"}
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_2.mir b/llvm/test/DebugInfo/NVPTX/inlinedAt_2.mir
index a6486af916864..ea62f7c7c8321 100644
--- a/llvm/test/DebugInfo/NVPTX/inlinedAt_2.mir
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_2.mir
@@ -113,7 +113,7 @@ body: |
bb.0.entry:
successors: %bb.1(0x40000000), %bb.2(0x40000000)
- %1:b32 = LD_i32 0, 0, 1, 3, 32, -1, @gg, 0, debug-location !6 :: (dereferenceable load (s32) from @gg, addrspace 1); t2.cu:9:3 @[ t2.cu:18:3 ]
+ %1:b32 = LD_i32 0, 0, 1, 3, 32, -1, 0, @gg, 0, 0, debug-location !6 :: (dereferenceable load (s32) from @gg, addrspace 1); t2.cu:9:3 @[ t2.cu:18:3 ]
%2:b1 = SETP_i32ri %1, 8, 2, debug-location !6; t2.cu:9:3 @[ t2.cu:18:3 ]
CBranch %2, %bb.2, debug-location !6; t2.cu:9:3 @[ t2.cu:18:3 ]
@@ -122,7 +122,7 @@ body: |
successors: %bb.2(0x80000000)
%0:b32 = nuw nsw ADD32ri %1, 1
- ST_i32 %0, 0, 0, 1, 32, @gg, 0, debug-location !11 :: (store (s32) into @gg, addrspace 1); t2.cu:14:3 @[ t2.cu:10:5 @[ t2.cu:18:3 ] ]
+ ST_i32 %0, 0, 0, 1, 32, 0, @gg, 0, 0, debug-location !11 :: (store (s32) into @gg, addrspace 1); t2.cu:14:3 @[ t2.cu:10:5 @[ t2.cu:18:3 ] ]
bb.2._Z3foov.exit:
diff --git a/llvm/test/Verifier/mem-cache-hint.ll b/llvm/test/Verifier/mem-cache-hint.ll
index 81b27a12fb2bd..5b6af20682b20 100644
--- a/llvm/test/Verifier/mem-cache-hint.ll
+++ b/llvm/test/Verifier/mem-cache-hint.ll
@@ -1,37 +1,68 @@
; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s
+declare void @foo(i32, i32)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias writeonly, ptr noalias readonly, i64, i1 immarg)
+
; CHECK: !mem.cache_hint is only valid on memory operations
define void @non_memory_op(i32 %x, i32 %y) {
- %z = add i32 %x, %y, !mem.cache_hint !{!{!"operand_no", i32 0}}
+ %z = add i32 %x, %y, !mem.cache_hint !{i32 0, !{!"nvvm.l1_eviction", !"first"}}
+ ret void
+}
+
+; CHECK: !mem.cache_hint must have even number of operands
+define void @odd_top_level_operands(ptr %p) {
+ %v = load i32, ptr %p, !mem.cache_hint !{i32 0}
+ ret void
+}
+
+; CHECK: !mem.cache_hint operand_no must be an integer constant in pair
+define void @operand_no_not_integer(ptr %p) {
+ %v = load i32, ptr %p, !mem.cache_hint !{!"zero", !{!"nvvm.l1_eviction", !"first"}}
ret void
}
-; CHECK: !mem.cache_hint operand must not be null
-define void @null_operand(ptr %p) {
- %v = load i32, ptr %p, !mem.cache_hint !{null}
+; CHECK: !mem.cache_hint operand_no must refer to a valid memory object operand
+define void @operand_no_not_pointer(i32 %x, i32 %y) {
+ call void @foo(i32 %x, i32 %y), !mem.cache_hint !{i32 0, !{!"nvvm.l1_eviction", !"first"}}
ret void
}
-; CHECK: !mem.cache_hint operand must be a metadata node
-define void @operand_not_mdnode(ptr %p) {
- %v = load i32, ptr %p, !mem.cache_hint !{!"not_a_node"}
+; CHECK: !mem.cache_hint operand_no must refer to a valid memory object operand
+define void @operand_no_out_of_range(ptr %p) {
+ %v = load i32, ptr %p, !mem.cache_hint !{i32 1, !{!"nvvm.l1_eviction", !"first"}}
ret void
}
-; CHECK: !mem.cache_hint node must have even number of operands
-define void @odd_operands(ptr %p) {
- %v = load i32, ptr %p, !mem.cache_hint !{!{!"operand_no", i32 0, !"extra"}}
+; CHECK: !mem.cache_hint contains duplicate operand_no
+define void @duplicate_operand_no(ptr %p) {
+ call void @llvm.memcpy.p0.p0.i64(ptr %p, ptr %p, i64 8, i1 false), !mem.cache_hint !{
+ i32 0, !{!"nvvm.l1_eviction", !"first"},
+ i32 0, !{!"nvvm.l1_eviction", !"last"}}
+ ret void
+}
+
+; CHECK: !mem.cache_hint hint node must be a metadata node
+define void @hint_node_not_mdnode(ptr %p) {
+ %v = load i32, ptr %p, !mem.cache_hint !{i32 0, !"not_a_node"}
+ ret void
+}
+
+; CHECK: !mem.cache_hint hint node must have even number of operands
+define void @hint_node_odd_operands(ptr %p) {
+ %v = load i32, ptr %p, !mem.cache_hint !{i32 0, !{!"nvvm.l1_eviction"}}
ret void
}
; CHECK: !mem.cache_hint key must be a string
define void @key_not_string(ptr %p) {
- %v = load i32, ptr %p, !mem.cache_hint !{!{i32 0, i32 1}}
+ %v = load i32, ptr %p, !mem.cache_hint !{i32 0, !{i32 0, !"first"}}
ret void
}
-; CHECK: !mem.cache_hint 'operand_no' must be an integer constant
-define void @operand_no_not_integer(ptr %p) {
- %v = load i32, ptr %p, !mem.cache_hint !{!{!"operand_no", !"zero"}}
+; CHECK: !mem.cache_hint hint node contains duplicate key
+define void @duplicate_key(ptr %p) {
+ %v = load i32, ptr %p, !mem.cache_hint !{i32 0, !{
+ !"nvvm.l1_eviction", !"first",
+ !"nvvm.l1_eviction", !"last"}}
ret void
}
More information about the llvm-commits
mailing list