[llvm] a88d580 - [NVPTX] Pull invariant load identification into IR pass (#138015)
Author: Alex MacLean
Date: 2025-05-01T10:19:46-07:00
New Revision: a88d580860b88bbb02797bae95032b6eb0c4579c
URL: https://github.com/llvm/llvm-project/commit/a88d580860b88bbb02797bae95032b6eb0c4579c
DIFF: https://github.com/llvm/llvm-project/commit/a88d580860b88bbb02797bae95032b6eb0c4579c.diff
LOG: [NVPTX] Pull invariant load identification into IR pass (#138015)
Pull invariant load identification, which was previously part of
DAGToDAG ISel, into a new IR pass, NVPTXTagInvariantLoads. This makes it
possible to disable the optimization at O0 and reduces the complexity
of the SelectionDAG pass. Moving the logic into an IR pass also makes it
possible to implement a more powerful traversal in the future.
Fixes https://github.com/llvm/llvm-project/issues/138138
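As an illustration (a minimal sketch distilled from the added
tag-invariant-loads.ll test; the function name @example is hypothetical), the
pass takes a global-address-space load from a noalias readonly kernel
parameter:

define ptx_kernel void @example(ptr noalias readonly %a, ptr %out) {
  %a_global = addrspacecast ptr %a to ptr addrspace(1)
  %val = load float, ptr addrspace(1) %a_global
  store float %val, ptr %out
  ret void
}

and tags the load with invariant-load metadata:

  %val = load float, ptr addrspace(1) %a_global, align 4, !invariant.load !0
  ...
  !0 = !{}

With the metadata in place, the simplified canLowerToLDG() check in ISel only
needs to look at N.isInvariant() to select ld.global.nc (LDG) for the load.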
Added:
llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
llvm/test/CodeGen/NVPTX/byval-const-global.ll
llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll
Modified:
llvm/lib/Target/NVPTX/CMakeLists.txt
llvm/lib/Target/NVPTX/NVPTX.h
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 1cffde138eab7..693f0d0b35edc 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -13,34 +13,35 @@ add_public_tablegen_target(NVPTXCommonTableGen)
set(NVPTXCodeGen_sources
NVPTXAliasAnalysis.cpp
NVPTXAllocaHoisting.cpp
- NVPTXAtomicLower.cpp
NVPTXAsmPrinter.cpp
NVPTXAssignValidGlobalNames.cpp
+ NVPTXAtomicLower.cpp
+ NVPTXCtorDtorLowering.cpp
NVPTXForwardParams.cpp
NVPTXFrameLowering.cpp
NVPTXGenericToNVVM.cpp
- NVPTXISelDAGToDAG.cpp
- NVPTXISelLowering.cpp
NVPTXImageOptimizer.cpp
NVPTXInstrInfo.cpp
+ NVPTXISelDAGToDAG.cpp
+ NVPTXISelLowering.cpp
NVPTXLowerAggrCopies.cpp
- NVPTXLowerArgs.cpp
NVPTXLowerAlloca.cpp
+ NVPTXLowerArgs.cpp
NVPTXLowerUnreachable.cpp
- NVPTXPeephole.cpp
NVPTXMCExpr.cpp
+ NVPTXPeephole.cpp
NVPTXPrologEpilogPass.cpp
+ NVPTXProxyRegErasure.cpp
NVPTXRegisterInfo.cpp
NVPTXReplaceImageHandles.cpp
NVPTXSelectionDAGInfo.cpp
NVPTXSubtarget.cpp
+ NVPTXTagInvariantLoads.cpp
NVPTXTargetMachine.cpp
NVPTXTargetTransformInfo.cpp
NVPTXUtilities.cpp
NVVMIntrRange.cpp
NVVMReflect.cpp
- NVPTXProxyRegErasure.cpp
- NVPTXCtorDtorLowering.cpp
)
add_llvm_target(NVPTXCodeGen
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index cf21ad991ccdf..1da979d023b42 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -51,6 +51,7 @@ FunctionPass *createNVPTXLowerArgsPass();
FunctionPass *createNVPTXLowerAllocaPass();
FunctionPass *createNVPTXLowerUnreachablePass(bool TrapUnreachable,
bool NoTrapAfterNoreturn);
+FunctionPass *createNVPTXTagInvariantLoadsPass();
MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
MachineFunctionPass *createNVPTXForwardParamsPass();
@@ -73,6 +74,7 @@ void initializeNVVMReflectPass(PassRegistry &);
void initializeNVPTXAAWrapperPassPass(PassRegistry &);
void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
void initializeNVPTXPeepholePass(PassRegistry &);
+void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &);
struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@@ -104,6 +106,10 @@ struct NVPTXLowerArgsPass : PassInfoMixin<NVPTXLowerArgsPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
+struct NVPTXTagInvariantLoadsPass : PassInfoMixin<NVPTXTagInvariantLoadsPass> {
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
namespace NVPTX {
enum DrvInterface {
NVCL,
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 5c41ac261224d..6f6084b99dda2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -767,46 +767,12 @@ NVPTX::Scope NVPTXDAGToDAGISel::getOperationScope(MemSDNode *N,
llvm_unreachable("unhandled ordering");
}
-static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
- unsigned CodeAddrSpace, MachineFunction *F) {
+static bool canLowerToLDG(const MemSDNode &N, const NVPTXSubtarget &Subtarget,
+ unsigned CodeAddrSpace) {
// We use ldg (i.e. ld.global.nc) for invariant loads from the global address
// space.
- //
- // We have two ways of identifying invariant loads: Loads may be explicitly
- // marked as invariant, or we may infer them to be invariant.
- //
- // We currently infer invariance for loads from
- // - constant global variables, and
- // - kernel function pointer params that are noalias (i.e. __restrict) and
- // never written to.
- //
- // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
- // not during the SelectionDAG phase).
- //
- // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
- // explicitly invariant loads because these are how clang tells us to use ldg
- // when the user uses a builtin.
- if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::AddressSpace::Global)
- return false;
-
- if (N->isInvariant())
- return true;
-
- bool IsKernelFn = isKernelFunction(F->getFunction());
-
- // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
- // because the former looks through phi nodes while the latter does not. We
- // need to look through phi nodes to handle pointer induction variables.
- SmallVector<const Value *, 8> Objs;
- getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);
-
- return all_of(Objs, [&](const Value *V) {
- if (auto *A = dyn_cast<const Argument>(V))
- return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
- if (auto *GV = dyn_cast<const GlobalVariable>(V))
- return GV->isConstant();
- return false;
- });
+ return Subtarget.hasLDG() && CodeAddrSpace == NVPTX::AddressSpace::Global &&
+ N.isInvariant();
}
static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
@@ -1107,10 +1073,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
return false;
// Address Space Setting
- unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
- if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
+ const unsigned CodeAddrSpace = getCodeAddrSpace(LD);
+ if (canLowerToLDG(*LD, *Subtarget, CodeAddrSpace))
return tryLDGLDU(N);
- }
SDLoc DL(N);
SDValue Chain = N->getOperand(0);
@@ -1196,10 +1161,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
const MVT MemVT = MemEVT.getSimpleVT();
// Address Space Setting
- unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
- if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
+ const unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
+ if (canLowerToLDG(*MemSD, *Subtarget, CodeAddrSpace))
return tryLDGLDU(N);
- }
EVT EltVT = N->getValueType(0);
SDLoc DL(N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
index 1c813c2c51f70..ee37c9826012c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
+++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
@@ -38,5 +38,6 @@ FUNCTION_ALIAS_ANALYSIS("nvptx-aa", NVPTXAA())
#endif
FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
-FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this));
+FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this))
+FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass())
#undef FUNCTION_PASS
diff --git a/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp b/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
new file mode 100644
index 0000000000000..a4aff44ac04f6
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXTagInvariantLoads.cpp
@@ -0,0 +1,104 @@
+//===------ NVPTXTagInvariantLoads.cpp - Tag invariant loads --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements invariant load tagging. It traverses load instructions
+// in a function and determines whether each load can be tagged as invariant.
+//
+// We currently infer invariance for loads from
+// - constant global variables, and
+// - kernel function pointer params that are noalias (i.e. __restrict) and
+// never written to.
+//
+// TODO: Perform a more powerful invariance analysis (ideally IPO).
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
+
+using namespace llvm;
+
+static bool isInvariantLoad(const LoadInst *LI, const bool IsKernelFn) {
+ // Don't bother with non-global loads
+ if (LI->getPointerAddressSpace() != NVPTXAS::ADDRESS_SPACE_GLOBAL)
+ return false;
+
+ // If the load is already marked as invariant, we don't need to do anything
+ if (LI->getMetadata(LLVMContext::MD_invariant_load))
+ return false;
+
+ // We use getUnderlyingObjects() here instead of getUnderlyingObject()
+ // mainly because the former looks through phi nodes while the latter does
+ // not. We need to look through phi nodes to handle pointer induction
+ // variables.
+ SmallVector<const Value *, 8> Objs;
+ getUnderlyingObjects(LI->getPointerOperand(), Objs);
+
+ return all_of(Objs, [&](const Value *V) {
+ if (const auto *A = dyn_cast<const Argument>(V))
+ return IsKernelFn && ((A->onlyReadsMemory() && A->hasNoAliasAttr()) ||
+ isParamGridConstant(*A));
+ if (const auto *GV = dyn_cast<const GlobalVariable>(V))
+ return GV->isConstant();
+ return false;
+ });
+}
+
+static void markLoadsAsInvariant(LoadInst *LI) {
+ LI->setMetadata(LLVMContext::MD_invariant_load,
+ MDNode::get(LI->getContext(), {}));
+}
+
+static bool tagInvariantLoads(Function &F) {
+ const bool IsKernelFn = isKernelFunction(F);
+
+ bool Changed = false;
+ for (auto &I : instructions(F)) {
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ if (isInvariantLoad(LI, IsKernelFn)) {
+ markLoadsAsInvariant(LI);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
+
+namespace {
+
+struct NVPTXTagInvariantLoadLegacyPass : public FunctionPass {
+ static char ID;
+
+ NVPTXTagInvariantLoadLegacyPass() : FunctionPass(ID) {}
+ bool runOnFunction(Function &F) override;
+};
+
+} // namespace
+
+INITIALIZE_PASS(NVPTXTagInvariantLoadLegacyPass, "nvptx-tag-invariant-loads",
+ "NVPTX Tag Invariant Loads", false, false)
+
+bool NVPTXTagInvariantLoadLegacyPass::runOnFunction(Function &F) {
+ return tagInvariantLoads(F);
+}
+
+char NVPTXTagInvariantLoadLegacyPass::ID = 0;
+
+FunctionPass *llvm::createNVPTXTagInvariantLoadsPass() {
+ return new NVPTXTagInvariantLoadLegacyPass();
+}
+
+PreservedAnalyses NVPTXTagInvariantLoadsPass::run(Function &F,
+ FunctionAnalysisManager &) {
+ return tagInvariantLoads(F) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
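For standalone testing, the pass is exposed to the new pass manager as
nvptx-tag-invariant-loads (see NVPTXPassRegistry.def above and the opt RUN line
in the added test). Conversely, here is a minimal sketch of a load the pass
deliberately leaves untagged, assuming the rules in isInvariantLoad() above
(the function name @no_noalias is hypothetical):

define ptx_kernel void @no_noalias(ptr readonly %a, ptr %out) {
  ; %a is readonly but not noalias, so isInvariantLoad() rejects it: the load
  ; below keeps no !invariant.load metadata and is lowered as a plain ld.global
  ; rather than ld.global.nc.
  %a_global = addrspacecast ptr %a to ptr addrspace(1)
  %val = load float, ptr addrspace(1) %a_global
  store float %val, ptr %out
  ret void
}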
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index f78d4585bbe98..dc3afc1f4a17d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -112,6 +112,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
initializeNVPTXAAWrapperPassPass(PR);
initializeNVPTXExternalAAWrapperPass(PR);
initializeNVPTXPeepholePass(PR);
+ initializeNVPTXTagInvariantLoadLegacyPassPass(PR);
}
static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
@@ -395,6 +396,7 @@ void NVPTXPassConfig::addIRPasses() {
if (!DisableLoadStoreVectorizer)
addPass(createLoadStoreVectorizerPass());
addPass(createSROAPass());
+ addPass(createNVPTXTagInvariantLoadsPass());
}
if (ST.hasPTXASUnreachableBug()) {
diff --git a/llvm/test/CodeGen/NVPTX/byval-const-global.ll b/llvm/test/CodeGen/NVPTX/byval-const-global.ll
new file mode 100644
index 0000000000000..cce317a52299c
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/byval-const-global.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_70 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct = type { [2 x i64] }
+@G = external constant %struct
+
+define void @foo() {
+; CHECK-LABEL: foo(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.global.u64 %rd1, [G];
+; CHECK-NEXT: ld.global.u64 %rd2, [G+8];
+; CHECK-NEXT: { // callseq 0, 0
+; CHECK-NEXT: .param .align 8 .b8 param0[16];
+; CHECK-NEXT: st.param.b64 [param0], %rd1;
+; CHECK-NEXT: st.param.b64 [param0+8], %rd2;
+; CHECK-NEXT: call.uni
+; CHECK-NEXT: bar,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-NEXT: } // callseq 0
+; CHECK-NEXT: ret;
+ call void @bar(ptr byval(%struct) @G)
+ ret void
+}
+
+declare void @bar(ptr)
diff --git a/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll b/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll
new file mode 100644
index 0000000000000..26967faa01a1b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=nvptx-tag-invariant-loads < %s -mcpu=sm_80 | FileCheck %s --check-prefix=OPT
+; RUN: llc -o - < %s -mcpu=sm_80 | FileCheck %s --check-prefix=PTX
+
+target triple = "nvptx-unknown-cuda"
+
+define ptx_kernel void @basic(ptr noalias readonly %a, ptr %out) {
+; OPT-LABEL: define ptx_kernel void @basic(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], ptr [[OUT:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT: [[A_GLOBAL:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; OPT-NEXT: [[VAL:%.*]] = load float, ptr addrspace(1) [[A_GLOBAL]], align 4, !invariant.load [[META0:![0-9]+]]
+; OPT-NEXT: store float [[VAL]], ptr [[OUT]], align 4
+; OPT-NEXT: ret void
+;
+; PTX-LABEL: basic(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<5>;
+; PTX-NEXT: .reg .b32 %f<2>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u32 %r1, [basic_param_0];
+; PTX-NEXT: cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT: ld.param.u32 %r3, [basic_param_1];
+; PTX-NEXT: cvta.to.global.u32 %r4, %r3;
+; PTX-NEXT: ld.global.nc.f32 %f1, [%r2];
+; PTX-NEXT: st.global.f32 [%r4], %f1;
+; PTX-NEXT: ret;
+ %a_global = addrspacecast ptr %a to ptr addrspace(1)
+ %val = load float, ptr addrspace(1) %a_global
+ store float %val, ptr %out
+ ret void
+}
+
+define ptx_kernel void @select(ptr noalias readonly %a, ptr noalias readonly %b, i1 %c, ptr %out) {
+; OPT-LABEL: define ptx_kernel void @select(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], ptr noalias readonly [[B:%.*]], i1 [[C:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr [[A]], ptr [[B]]
+; OPT-NEXT: [[SELECT_GLOBAL:%.*]] = addrspacecast ptr [[SELECT]] to ptr addrspace(1)
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(1) [[SELECT_GLOBAL]], align 4, !invariant.load [[META0]]
+; OPT-NEXT: store i32 [[VAL]], ptr [[OUT]], align 4
+; OPT-NEXT: ret void
+;
+; PTX-LABEL: select(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b16 %rs<3>;
+; PTX-NEXT: .reg .b32 %r<9>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u8 %rs1, [select_param_2];
+; PTX-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; PTX-NEXT: ld.param.u32 %r1, [select_param_0];
+; PTX-NEXT: cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT: ld.param.u32 %r3, [select_param_1];
+; PTX-NEXT: cvta.to.global.u32 %r4, %r3;
+; PTX-NEXT: ld.param.u32 %r5, [select_param_3];
+; PTX-NEXT: cvta.to.global.u32 %r6, %r5;
+; PTX-NEXT: selp.b32 %r7, %r2, %r4, %p1;
+; PTX-NEXT: ld.global.nc.u32 %r8, [%r7];
+; PTX-NEXT: st.global.u32 [%r6], %r8;
+; PTX-NEXT: ret;
+ %select = select i1 %c, ptr %a, ptr %b
+ %select_global = addrspacecast ptr %select to ptr addrspace(1)
+ %val = load i32, ptr addrspace(1) %select_global
+ store i32 %val, ptr %out
+ ret void
+}
+
+define void @not_kernel(ptr noalias readonly %a, ptr %out) {
+; OPT-LABEL: define void @not_kernel(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[A_GLOBAL:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; OPT-NEXT: [[VAL:%.*]] = load float, ptr addrspace(1) [[A_GLOBAL]], align 4
+; OPT-NEXT: store float [[VAL]], ptr [[OUT]], align 4
+; OPT-NEXT: ret void
+;
+; PTX-LABEL: not_kernel(
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b32 %f<2>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u32 %r1, [not_kernel_param_0];
+; PTX-NEXT: cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT: ld.param.u32 %r3, [not_kernel_param_1];
+; PTX-NEXT: ld.global.f32 %f1, [%r2];
+; PTX-NEXT: st.f32 [%r3], %f1;
+; PTX-NEXT: ret;
+ %a_global = addrspacecast ptr %a to ptr addrspace(1)
+ %val = load float, ptr addrspace(1) %a_global
+ store float %val, ptr %out
+ ret void
+}
+
+%struct.S2 = type { i64, i64 }
+@G = private unnamed_addr constant %struct.S2 { i64 1, i64 1 }, align 8
+
+define ptx_kernel void @global_load(ptr noalias readonly %a, i1 %c, ptr %out) {
+; OPT-LABEL: define ptx_kernel void @global_load(
+; OPT-SAME: ptr noalias readonly [[A:%.*]], i1 [[C:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[G_GLOBAL:%.*]] = addrspacecast ptr @G to ptr addrspace(1)
+; OPT-NEXT: [[A_GLOBAL:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
+; OPT-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr addrspace(1) [[G_GLOBAL]], ptr addrspace(1) [[A_GLOBAL]]
+; OPT-NEXT: [[VAL:%.*]] = load i64, ptr addrspace(1) [[SELECT]], align 8, !invariant.load [[META0]]
+; OPT-NEXT: store i64 [[VAL]], ptr [[OUT]], align 8
+; OPT-NEXT: ret void
+;
+; PTX-LABEL: global_load(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b16 %rs<3>;
+; PTX-NEXT: .reg .b32 %r<7>;
+; PTX-NEXT: .reg .b64 %rd<2>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u8 %rs1, [global_load_param_1];
+; PTX-NEXT: and.b16 %rs2, %rs1, 1;
+; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0;
+; PTX-NEXT: ld.param.u32 %r1, [global_load_param_0];
+; PTX-NEXT: cvta.to.global.u32 %r2, %r1;
+; PTX-NEXT: ld.param.u32 %r3, [global_load_param_2];
+; PTX-NEXT: cvta.to.global.u32 %r4, %r3;
+; PTX-NEXT: mov.b32 %r5, G;
+; PTX-NEXT: selp.b32 %r6, %r5, %r2, %p1;
+; PTX-NEXT: ld.global.nc.u64 %rd1, [%r6];
+; PTX-NEXT: st.global.u64 [%r4], %rd1;
+; PTX-NEXT: ret;
+ %g_global = addrspacecast ptr @G to ptr addrspace(1)
+ %a_global = addrspacecast ptr %a to ptr addrspace(1)
+ %select = select i1 %c, ptr addrspace(1) %g_global, ptr addrspace(1) %a_global
+ %val = load i64, ptr addrspace(1) %select
+ store i64 %val, ptr %out
+ ret void
+}
+;.
+; OPT: [[META0]] = !{}
+;.