[llvm] [NVPTX] Pull invariant load identification into IR pass (PR #138015)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Thu May 1 08:14:46 PDT 2025
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/138015
From af03c15bc770c2f107f331ac94fa23a6f88fa7ff Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Thu, 24 Apr 2025 21:34:58 +0000
Subject: [PATCH 1/3] [NVPTX] Pull invariant load identification into IR pass
---
llvm/lib/Target/NVPTX/CMakeLists.txt | 15 +-
llvm/lib/Target/NVPTX/NVPTX.h | 6 +
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 52 +------
llvm/lib/Target/NVPTX/NVPTXPassRegistry.def | 3 +-
.../Target/NVPTX/NVPTXTagInvariantLoads.cpp | 104 +++++++++++++
llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 2 +
.../test/CodeGen/NVPTX/tag-invariant-loads.ll | 138 ++++++++++++++++++
7 files changed, 268 insertions(+), 52 deletions(-)
create mode 100644 llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
create mode 100644 llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll
diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 1cffde138eab7..693f0d0b35edc 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -13,34 +13,35 @@ add_public_tablegen_target(NVPTXCommonTableGen)
set(NVPTXCodeGen_sources
NVPTXAliasAnalysis.cpp
NVPTXAllocaHoisting.cpp
- NVPTXAtomicLower.cpp
NVPTXAsmPrinter.cpp
NVPTXAssignValidGlobalNames.cpp
+ NVPTXAtomicLower.cpp
+ NVPTXCtorDtorLowering.cpp
NVPTXForwardParams.cpp
NVPTXFrameLowering.cpp
NVPTXGenericToNVVM.cpp
- NVPTXISelDAGToDAG.cpp
- NVPTXISelLowering.cpp
NVPTXImageOptimizer.cpp
NVPTXInstrInfo.cpp
+ NVPTXISelDAGToDAG.cpp
+ NVPTXISelLowering.cpp
NVPTXLowerAggrCopies.cpp
- NVPTXLowerArgs.cpp
NVPTXLowerAlloca.cpp
+ NVPTXLowerArgs.cpp
NVPTXLowerUnreachable.cpp
- NVPTXPeephole.cpp
NVPTXMCExpr.cpp
+ NVPTXPeephole.cpp
NVPTXPrologEpilogPass.cpp
+ NVPTXProxyRegErasure.cpp
NVPTXRegisterInfo.cpp
NVPTXReplaceImageHandles.cpp
NVPTXSelectionDAGInfo.cpp
NVPTXSubtarget.cpp
+ NVPTXTagInvariantLoads.cpp
NVPTXTargetMachine.cpp
NVPTXTargetTransformInfo.cpp
NVPTXUtilities.cpp
NVVMIntrRange.cpp
NVVMReflect.cpp
- NVPTXProxyRegErasure.cpp
- NVPTXCtorDtorLowering.cpp
)
add_llvm_target(NVPTXCodeGen
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index cf21ad991ccdf..1da979d023b42 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -51,6 +51,7 @@ FunctionPass *createNVPTXLowerArgsPass();
FunctionPass *createNVPTXLowerAllocaPass();
FunctionPass *createNVPTXLowerUnreachablePass(bool TrapUnreachable,
bool NoTrapAfterNoreturn);
+FunctionPass *createNVPTXTagInvariantLoadsPass();
MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
MachineFunctionPass *createNVPTXForwardParamsPass();
@@ -73,6 +74,7 @@ void initializeNVVMReflectPass(PassRegistry &);
void initializeNVPTXAAWrapperPassPass(PassRegistry &);
void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
void initializeNVPTXPeepholePass(PassRegistry &);
+void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &);
struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@@ -104,6 +106,10 @@ struct NVPTXLowerArgsPass : PassInfoMixin<NVPTXLowerArgsPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
+struct NVPTXTagInvariantLoadsPass : PassInfoMixin<NVPTXTagInvariantLoadsPass> {
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
namespace NVPTX {
enum DrvInterface {
NVCL,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 295ed666a1902..0eb350bc4cc97 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -766,46 +766,12 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
llvm_unreachable("unhandled ordering");
}
-static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
- unsigned CodeAddrSpace, MachineFunction *F) {
+static bool canLowerToLDG(const MemSDNode *N, const NVPTXSubtarget &Subtarget,
+ unsigned CodeAddrSpace) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
// space.
- //
- // We have two ways of identifying invariant loads: Loads may be explicitly
- // marked as invariant, or we may infer them to be invariant.
- //
- // We currently infer invariance for loads from
- // - constant global variables, and
- // - kernel function pointer params that are noalias (i.e. __restrict) and
- // never written to.
- //
- // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
- // not during the SelectionDAG phase).
- //
- // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
- // explicitly invariant loads because these are how clang tells us to use ldg
- // when the user uses a builtin.
- if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::AddressSpace::Global)
- return false;
-
- if (N->isInvariant())
- return true;
-
- bool IsKernelFn = isKernelFunction(F->getFunction());
-
- // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
- // because the former looks through phi nodes while the latter does not. We
- // need to look through phi nodes to handle pointer induction variables.
- SmallVector<const Value *, 8> Objs;
- getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
-
- return all_of(Objs, [&](const Value *V) {
- if (auto *A = dyn_cast<const Argument>(V))
- return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
- if (auto *GV = dyn_cast<const GlobalVariable>(V))
- return GV->isConstant();
- return false;
- });
+ return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
+ N->isInvariant();
}
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
@@ -1106,10 +1072,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
return false;
// Address Space Setting
- unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
- if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
+ const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
+ if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace))
return tryLDGLDU(N);
- }
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
@@ -1192,10 +1157,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
const MVT MemVT = MemEVT.getSimpleVT();
// Address Space Setting
- unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
- if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
+ const unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
+ if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace))
return tryLDGLDU(N);
- }
EVT EltVT = N->getValueType(0);
SDLoc DL(N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
index 1c813c2c51f70..ee37c9826012c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
+++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
@@ -38,5 +38,6 @@ FUNCTION_ALIAS_ANALYSIS("nvptx-aa", NVPTXAA())
#endif
FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
-FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this));
+FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this))
+FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass())
#undef FUNCTION_PASS
diff --git a/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp b/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
new file mode 100644
index 0000000000000..92da23bec8073
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
@@ -0,0 +1,104 @@
+//===------ NVPTXTagInvariantLoads.cpp - Tag invariant loads --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements invariant load tagging. It traverses load instructions
+// in a function and determines whether each can be tagged as invariant.
+//
+// We currently infer invariance for loads from
+// - constant global variables, and
+// - kernel function pointer params that are noalias (i.e. __restrict) and
+// never written to.
+//
+// TODO: Perform a more powerful invariance analysis (ideally IPO).
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
+
+using namespace llvm;
+
+static void markLoadsAsInvariant(LoadInst *LI) {
+ LI->setMetadata(LLVMContext::MD_invariant_load,
+ MDNode::get(LI->getContext(), {}));
+}
+
+static bool tagInvariantLoads(Function &F) {
+ const bool IsKernelFn = isKernelFunction(F);
+
+ bool Changed = false;
+ for (auto &I : instructions(F)) {
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
+
+ // Don't bother with non-global loads
+ if (LI->getPointerAddressSpace() != NVPTXAS::ADDRESS_SPACE_GLOBAL)
+ continue;
+
+ if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ continue;
+
+ SmallVector<const Value *, 8> Objs;
+
+ // We use getUnderlyingObjects() here instead of getUnderlyingObject()
+ // mainly because the former looks through phi nodes while the latter does
+ // not. We need to look through phi nodes to handle pointer induction
+ // variables.
+
+ getUnderlyingObjects(LI->getPointerOperand(), Objs);
+
+ const bool IsInvariant = all_of(Objs, [&](const Value *V) {
+ if (const auto *A = dyn_cast<const Argument>(V))
+ return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
+ if (const auto *GV = dyn_cast<const GlobalVariable>(V))
+ return GV->isConstant();
+ return false;
+ });
+
+ if (IsInvariant) {
+ markLoadsAsInvariant(LI);
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+namespace {
+
+struct NVPTXTagInvariantLoadLegacyPass : public FunctionPass {
+ static char ID;
+
+ NVPTXTagInvariantLoadLegacyPass() : FunctionPass(ID) {}
+ bool runOnFunction(Function &F) override;
+};
+
+} // namespace
+
+INITIALIZE_PASS(NVPTXTagInvariantLoadLegacyPass, "nvptx-tag-invariant-loads",
+ "NVPTX Tag Invariant Loads", false, false)
+
+bool NVPTXTagInvariantLoadLegacyPass::runOnFunction(Function &F) {
+ return tagInvariantLoads(F);
+}
+
+char NVPTXTagInvariantLoadLegacyPass::ID = 0;
+
+FunctionPass *llvm::createNVPTXTagInvariantLoadsPass() {
+ return new NVPTXTagInvariantLoadLegacyPass();
+}
+
+PreservedAnalyses NVPTXTagInvariantLoadsPass::run(Function &F,
+ FunctionAnalysisManager &) {
+ return tagInvariantLoads(F) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
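(For context, not part of the patch: a minimal sketch, with hypothetical names, of the pointer-induction case that motivates getUnderlyingObjects() over getUnderlyingObject(). The pointer operand is a phi; looking through it shows that every underlying object is the noalias readonly kernel param, so the load qualifies for tagging:

define ptx_kernel void @phi_induction(ptr addrspace(1) noalias readonly %a, ptr %out, i32 %n) {
entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %p = phi ptr addrspace(1) [ %a, %entry ], [ %p.next, %loop ]
  %acc = phi float [ 0.0, %entry ], [ %acc.next, %loop ]
  ; the pass would tag this load with !invariant.load
  %v = load float, ptr addrspace(1) %p
  %acc.next = fadd float %acc, %v
  %p.next = getelementptr inbounds float, ptr addrspace(1) %p, i32 1
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  store float %acc.next, ptr %out
  ret void
})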
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index f78d4585bbe98..dc3afc1f4a17d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -112,6 +112,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
initializeNVPTXAAWrapperPassPass(PR);
initializeNVPTXExternalAAWrapperPass(PR);
initializeNVPTXPeepholePass(PR);
+ initializeNVPTXTagInvariantLoadLegacyPassPass(PR);
}
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
@@ -395,6 +396,7 @@ void NVPTXPassConfig::addIRPasses() {
if (!DisableLoadStoreVectorizer)
addPass(createLoadStoreVectorizerPass());
addPass(createSROAPass());
+ addPass(createNVPTXTagInvariantLoadsPass());
}
if (ST.hasPTXASUnreachableBug()) {
diff --git a/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll b/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll
new file mode 100644
index 0000000000000..26967faa01a1b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=nvptx-tag-invariant-loads < %s -mcpu=sm_80 | FileCheck %s --check-prefix=OPT
+; RUN: llc -o - < %s -mcpu=sm_80 | FileCheck %s --check-prefix=PTX
+
+target triple = "nvptx-unknown-cuda"
+
+define ptx_kernel void @basic(ptr noalias readonly %a, ptr %out) {
+; OPT-LABEL: define ptx_kernel void @basic(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT: [[A_GLOBAL:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; OPT-NEXT: [[VAL:%.*]] = load float, ptr addrspace(1) [[A_GLOBAL]], align 4, !invariant.load [[META0:![0-9]+]]
+; OPT-NEXT: store float [[VAL]], ptr [[OUT]], align 4
+; OPT-NEXT: ret void
+;
+; PTX-LABEL: basic(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<5>;
+; PTX-NEXT: .reg .b32 %f<2>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u32 %r1, [basic_param_0];
+; PTX-NEXT: cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT: ld.param.u32 %r3, [basic_param_1];
+; PTX-NEXT: cvta.to.global.u32 %r4, %r3;
+; PTX-NEXT: ld.global.nc.f32 %f1, [%r2];
+; PTX-NEXT: st.global.f32 [%r4], %f1;
+; PTX-NEXT: ret;
+  %a_global = addrspacecast ptr %a to ptr addrspace(1)
+  %val = load float, ptr addrspace(1) %a_global
+  store float %val, ptr %out
+  ret void
+}
+
+define ptx_kernel void @select(ptr noalias readonly %a, ptr noalias readonly %b, i1 %c, ptr %out) {
+; OPT-LABEL: define ptx_kernel void @select(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], ptr noalias readonly [[B:%.*]], i1 [[C:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr [[A]], ptr [[B]]
+; OPT-NEXT: [[SELECT_GLOBAL:%.*]] = addrspacecast ptr [[SELECT]] to ptr addrspace(1)
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[SELECT_GLOBAL]], align 4, !invariant.load [[META0]]
+; OPT-NEXT: store i32 [[VAL]], ptr [[OUT]], align 4
+; OPT-NEXT: ret void
+;
+; PTX-LABEL: select(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b16 %rs<3>;
+; PTX-NEXT: .reg .b32 %r<9>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u8 %rs1, [select_param_2];
+; PTX-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; PTX-NEXT: ld.param.u32 %r1, [select_param_0];
+; PTX-NEXT: cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT: ld.param.u32 %r3, [select_param_1];
+; PTX-NEXT: cvta.to.global.u32 %r4, %r3;
+; PTX-NEXT: ld.param.u32 %r5, [select_param_3];
+; PTX-NEXT: cvta.to.global.u32 %r6, %r5;
+; PTX-NEXT: selp.b32 %r7, %r2, %r4, %p1;
+; PTX-NEXT: ld.global.nc.u32 %r8, [%r7];
+; PTX-NEXT: st.global.u32 [%r6], %r8;
+; PTX-NEXT: ret;
+  %select = select i1 %c, ptr %a, ptr %b
+  %select_global = addrspacecast ptr %select to ptr addrspace(1)
+  %val = load i32, ptr addrspace(1) %select_global
+  store i32 %val, ptr %out
+  ret void
+}
+
+define void @not_kernel(ptr noalias readonly %a, ptr %out) {
+; OPT-LABEL: define void @not_kernel(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[A_GLOBAL:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; OPT-NEXT: [[VAL:%.*]] = load float, ptr addrspace(1) [[A_GLOBAL]], align 4
+; OPT-NEXT: store float [[VAL]], ptr [[OUT]], align 4
+; OPT-NEXT: ret void
+;
+; PTX-LABEL: not_kernel(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b32 %f<2>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u32 %r1, [not_kernel_param_0];
+; PTX-NEXT: cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT: ld.param.u32 %r3, [not_kernel_param_1];
+; PTX-NEXT: ld.global.f32 %f1, [%r2];
+; PTX-NEXT: st.f32 [%r3], %f1;
+; PTX-NEXT: ret;
+  %a_global = addrspacecast ptr %a to ptr addrspace(1)
+  %val = load float, ptr addrspace(1) %a_global
+  store float %val, ptr %out
+  ret void
+}
+
+%struct.S2 = type { i64, i64 }
+@G = private unnamed_addr constant %struct.S2 { i64 1, i64 1 }, align 8
+
+define ptx_kernel void @global_load(ptr noalias readonly %a, i1 %c, ptr %out) {
+; OPT-LABEL: define ptx_kernel void @global_load(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], i1 [[C:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[G_GLOBAL:%.*]] = addrspacecast ptr @G to ptr addrspace(1)
+; OPT-NEXT: [[A_GLOBAL:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; OPT-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr addrspace(1) [[G_GLOBAL]], ptr addrspace(1) [[A_GLOBAL]]
+; OPT-NEXT: [[VAL:%.*]] = load i64, ptr addrspace(1) [[SELECT]], align 8, !invariant.load [[META0]]
+; OPT-NEXT: store i64 [[VAL]], ptr [[OUT]], align 8
+; OPT-NEXT: ret void
+;
+; PTX-LABEL: global_load(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b16 %rs<3>;
+; PTX-NEXT: .reg .b32 %r<7>;
+; PTX-NEXT: .reg .b64 %rd<2>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u8 %rs1, [global_load_param_1];
+; PTX-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; PTX-NEXT: ld.param.u32 %r1, [global_load_param_0];
+; PTX-NEXT: cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT: ld.param.u32 %r3, [global_load_param_2];
+; PTX-NEXT: cvta.to.global.u32 %r4, %r3;
+; PTX-NEXT: mov.b32 %r5, G;
+; PTX-NEXT: selp.b32 %r6, %r5, %r2, %p1;
+; PTX-NEXT: ld.global.nc.u64 %rd1, [%r6];
+; PTX-NEXT: st.global.u64 [%r4], %rd1;
+; PTX-NEXT: ret;
+  %g_global = addrspacecast ptr @G to ptr addrspace(1)
+  %a_global = addrspacecast ptr %a to ptr addrspace(1)
+  %select = select i1 %c, ptr addrspace(1) %g_global, ptr addrspace(1) %a_global
+  %val = load i64, ptr addrspace(1) %select
+  store i64 %val, ptr %out
+  ret void
+}
+;.
+; OPT: [[META0]] = !{}
+;.
From 7fd4f03df72fc59f1f3fbbc26216e7e988c9e5fd Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Thu, 1 May 2025 00:06:32 +0000
Subject: [PATCH 2/3] address comments
---
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 8 +--
.../Target/NVPTX/NVPTXTagInvariantLoads.cpp | 54 +++++++++----------
2 files changed, 31 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 0eb350bc4cc97..7d83f8833075c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -766,12 +766,12 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
llvm_unreachable("unhandled ordering");
}
-static bool canLowerToLDG(const MemSDNode *N, const NVPTXSubtarget &Subtarget,
+static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
unsigned CodeAddrSpace) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
// space.
return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
- N->isInvariant();
+ N.isInvariant();
}
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
@@ -1073,7 +1073,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
// Address Space Setting
const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
- if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace))
+ if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
return tryLDGLDU(N);
SDLoc DL(N);
@@ -1158,7 +1158,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
// Address Space Setting
const unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
- if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace))
+ if (canLowerToLDG(*MemSD, *Subtarget, CodeAddrSpace))
return tryLDGLDU(N);
EVT EltVT = N->getValueType(0);
diff --git a/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp b/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
index 92da23bec8073..a4aff44ac04f6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
@@ -27,6 +27,32 @@
using namespace llvm;
+static bool isInvariantLoad(const LoadInst *LI, const bool IsKernelFn) {
+ // Don't bother with non-global loads
+ if (LI->getPointerAddressSpace() != NVPTXAS::ADDRESS_SPACE_GLOBAL)
+ return false;
+
+ // If the load is already marked as invariant, we don't need to do anything
+ if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ return false;
+
+ // We use getUnderlyingObjects() here instead of getUnderlyingObject()
+ // mainly because the former looks through phi nodes while the latter does
+ // not. We need to look through phi nodes to handle pointer induction
+ // variables.
+ SmallVector<const Value *, 8> Objs;
+ getUnderlyingObjects(LI->getPointerOperand(), Objs);
+
+ return all_of(Objs, [&](const Value *V) {
+ if (const auto *A = dyn_cast<const Argument>(V))
+ return IsKernelFn && ((A->onlyReadsMemory() && A->hasNoAliasAttr()) ||
+ isParamGridConstant(*A));
+ if (const auto *GV = dyn_cast<const GlobalVariable>(V))
+ return GV->isConstant();
+ return false;
+ });
+}
+
static void markLoadsAsInvariant(LoadInst *LI) {
LI->setMetadata(LLVMContext::MD_invariant_load,
MDNode::get(LI->getContext(), {}));
@@ -38,38 +64,12 @@ static bool tagInvariantLoads(Function &F) {
bool Changed = false;
for (auto &I : instructions(F)) {
if (auto *LI = dyn_cast<LoadInst>(&I)) {
-
- // Don't bother with non-global loads
- if (LI->getPointerAddressSpace() != NVPTXAS::ADDRESS_SPACE_GLOBAL)
- continue;
-
- if (LI->getMetadata(LLVMContext::MD_invariant_load))
- continue;
-
- SmallVector<const Value *, 8> Objs;
-
- // We use getUnderlyingObjects() here instead of getUnderlyingObject()
- // mainly because the former looks through phi nodes while the latter does
- // not. We need to look through phi nodes to handle pointer induction
- // variables.
-
- getUnderlyingObjects(LI->getPointerOperand(), Objs);
-
- const bool IsInvariant = all_of(Objs, [&](const Value *V) {
- if (const auto *A = dyn_cast<const Argument>(V))
- return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
- if (const auto *GV = dyn_cast<const GlobalVariable>(V))
- return GV->isConstant();
- return false;
- });
-
- if (IsInvariant) {
+ if (isInvariantLoad(LI, IsKernelFn)) {
markLoadsAsInvariant(LI);
Changed = true;
}
}
}
-
return Changed;
}
From 0469aae214bcc4460c08e82702084ae6c68af5b6 Mon Sep 17 00:00:00 2001
From: Alex Maclean <amaclean at nvidia.com>
Date: Thu, 1 May 2025 15:14:30 +0000
Subject: [PATCH 3/3] add test for #138138
---
llvm/test/CodeGen/NVPTX/byval-const-global.ll | 33 +++++++++++++++++++
1 file changed, 33 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/byval-const-global.ll
diff --git a/llvm/test/CodeGen/NVPTX/byval-const-global.ll b/llvm/test/CodeGen/NVPTX/byval-const-global.ll
new file mode 100644
index 0000000000000..3ffa65ebc419a
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/byval-const-global.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct = type { [2 x i64] }
+@G = external constant %struct
+
+define void @foo() {
+; CHECK-LABEL: foo(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.global.u64 %rd1, [G];
+; CHECK-NEXT: ld.global.u64 %rd2, [G+8];
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .align 8 .b8 param0[16];
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-NEXT: st.param.b64 [param0+8], %rd2;
+; CHECK-NEXT: call.uni
+; CHECK-NEXT: bar,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: ret;
+  call void @bar(ptr byval(%struct) @G)
+  ret void
+}
+
+declare void @bar(ptr)