[llvm] 8db31e9 - [NVPTX] Do not addrspacecast AS-specific kernel arguments.
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 26 11:34:03 PST 2023
Author: Artem Belevich
Date: 2023-01-26T11:29:20-08:00
New Revision: 8db31e932d33b63449d3727c0496fce29883ebb1
URL: https://github.com/llvm/llvm-project/commit/8db31e932d33b63449d3727c0496fce29883ebb1
DIFF: https://github.com/llvm/llvm-project/commit/8db31e932d33b63449d3727c0496fce29883ebb1.diff
LOG: [NVPTX] Do not addrspacecast AS-specific kernel arguments.
Fixes https://github.com/llvm/llvm-project/issues/46954
The assumption that generic pointers passed to a CUDA kernel point to the global
address space is CUDA-specific and should not be applied to non-CUDA compilations.
Addrspacecasts to the global AS and back should never be applied to AS-specific pointers.
In order to make tests actually do the testing for non-CUDA compilation, we need
to get TargetMachine from the TargetPassConfig, instead of passing it explicitly
as a pass constructor argument.
Differential Revision: https://reviews.llvm.org/D142581
Added:
Modified:
llvm/lib/Target/NVPTX/NVPTX.h
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
llvm/test/CodeGen/NVPTX/lower-args.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 3bd9a7f08f549..95184420f6087 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -44,7 +44,7 @@ FunctionPass *createNVVMReflectPass(unsigned int SmVersion);
MachineFunctionPass *createNVPTXPrologEpilogPass();
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
FunctionPass *createNVPTXImageOptimizerPass();
-FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM);
+FunctionPass *createNVPTXLowerArgsPass();
FunctionPass *createNVPTXLowerAllocaPass();
MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 306c485b87913..ee82f7b903daa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -93,10 +93,12 @@
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include <numeric>
#include <queue>
@@ -113,11 +115,11 @@ namespace {
class NVPTXLowerArgs : public FunctionPass {
bool runOnFunction(Function &F) override;
- bool runOnKernelFunction(Function &F);
- bool runOnDeviceFunction(Function &F);
+ bool runOnKernelFunction(const NVPTXTargetMachine &TM, Function &F);
+ bool runOnDeviceFunction(const NVPTXTargetMachine &TM, Function &F);
// handle byval parameters
- void handleByValParam(Argument *Arg);
+ void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg);
// Knowing Ptr must point to the global address space, this function
// addrspacecasts Ptr to global and then back to generic. This allows
// NVPTXInferAddressSpaces to fold the global-to-generic cast into
@@ -126,21 +128,23 @@ class NVPTXLowerArgs : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
- NVPTXLowerArgs(const NVPTXTargetMachine *TM = nullptr)
- : FunctionPass(ID), TM(TM) {}
+ NVPTXLowerArgs() : FunctionPass(ID) {}
StringRef getPassName() const override {
return "Lower pointer arguments of CUDA kernels";
}
-
-private:
- const NVPTXTargetMachine *TM;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ }
};
} // namespace
char NVPTXLowerArgs::ID = 1;
-INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
- "Lower arguments (NVPTX)", false, false)
+INITIALIZE_PASS_BEGIN(NVPTXLowerArgs, "nvptx-lower-args",
+ "Lower arguments (NVPTX)", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
+ "Lower arguments (NVPTX)", false, false)
// =============================================================================
// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
@@ -310,7 +314,8 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
}
}
-void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
+void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
+ Argument *Arg) {
Function *Func = Arg->getParent();
Instruction *FirstInst = &(Func->getEntryBlock().front());
Type *StructType = Arg->getParamByValType();
@@ -354,12 +359,8 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
convertToParamAS(V, ArgInParamAS);
LLVM_DEBUG(dbgs() << "No need to copy " << *Arg << "\n");
- // Further optimizations require target lowering info.
- if (!TM)
- return;
-
const auto *TLI =
- cast<NVPTXTargetLowering>(TM->getSubtargetImpl()->getTargetLowering());
+ cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering());
adjustByValArgAlignment(Arg, ArgInParamAS, TLI);
@@ -390,7 +391,7 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
}
void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
- if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL)
+ if (Ptr->getType()->getPointerAddressSpace() != ADDRESS_SPACE_GENERIC)
return;
// Deciding where to emit the addrspacecast pair.
@@ -420,8 +421,9 @@ void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
// =============================================================================
// Main function for this pass.
// =============================================================================
-bool NVPTXLowerArgs::runOnKernelFunction(Function &F) {
- if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
+bool NVPTXLowerArgs::runOnKernelFunction(const NVPTXTargetMachine &TM,
+ Function &F) {
+ if (TM.getDrvInterface() == NVPTX::CUDA) {
// Mark pointers in byval structs as global.
for (auto &B : F) {
for (auto &I : B) {
@@ -444,8 +446,8 @@ bool NVPTXLowerArgs::runOnKernelFunction(Function &F) {
for (Argument &Arg : F.args()) {
if (Arg.getType()->isPointerTy()) {
if (Arg.hasByValAttr())
- handleByValParam(&Arg);
- else if (TM && TM->getDrvInterface() == NVPTX::CUDA)
+ handleByValParam(TM, &Arg);
+ else if (TM.getDrvInterface() == NVPTX::CUDA)
markPointerAsGlobal(&Arg);
}
}
@@ -453,19 +455,20 @@ bool NVPTXLowerArgs::runOnKernelFunction(Function &F) {
}
// Device functions only need to copy byval args into local memory.
-bool NVPTXLowerArgs::runOnDeviceFunction(Function &F) {
+bool NVPTXLowerArgs::runOnDeviceFunction(const NVPTXTargetMachine &TM,
+ Function &F) {
LLVM_DEBUG(dbgs() << "Lowering function args of " << F.getName() << "\n");
for (Argument &Arg : F.args())
if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
- handleByValParam(&Arg);
+ handleByValParam(TM, &Arg);
return true;
}
bool NVPTXLowerArgs::runOnFunction(Function &F) {
- return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
-}
+ auto &TM = getAnalysis<TargetPassConfig>().getTM<NVPTXTargetMachine>();
-FunctionPass *
-llvm::createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM) {
- return new NVPTXLowerArgs(TM);
+ return isKernelFunction(F) ? runOnKernelFunction(TM, F)
+ : runOnDeviceFunction(TM, F);
}
+
+FunctionPass *llvm::createNVPTXLowerArgsPass() { return new NVPTXLowerArgs(); }
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 36814d9f57427..4f7762acecd48 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -326,7 +326,7 @@ void NVPTXPassConfig::addIRPasses() {
// NVPTXLowerArgs is required for correctness and should be run right
// before the address space inference passes.
- addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
+ addPass(createNVPTXLowerArgsPass());
if (getOptLevel() != CodeGenOpt::None) {
addAddressSpaceInferencePasses();
addStraightLineScalarOptimizationPasses();
diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll
index c2d6d3432680b..657c9050c4e44 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args.ll
@@ -1,5 +1,7 @@
-; RUN: opt < %s -S -nvptx-lower-args | FileCheck %s --check-prefix IR
-; RUN: llc < %s -mcpu=sm_20 | FileCheck %s --check-prefix PTX
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,IR,IRC
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-nvcl | FileCheck %s --check-prefixes COMMON,IR,IRO
+; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes COMMON,PTX,PTXC
+; RUN: llc < %s -mcpu=sm_20 --mtriple nvptx64-nvidia-nvcl| FileCheck %s --check-prefixes COMMON,PTX,PTXO
; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
@@ -9,6 +11,7 @@ target triple = "nvptx64-nvidia-cuda"
%class.inner = type { ptr, ptr }
; Check that nvptx-lower-args preserves arg alignment
+; COMMON-LABEL: load_alignment
define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 %arg) {
entry:
; IR: load %class.outer, ptr addrspace(101)
@@ -30,5 +33,43 @@ entry:
ret void
}
+
+; COMMON-LABEL: ptr_generic
+define void @ptr_generic(ptr %out, ptr %in) {
+; IRC: %in3 = addrspacecast ptr %in to ptr addrspace(1)
+; IRC: %in4 = addrspacecast ptr addrspace(1) %in3 to ptr
+; IRC: %out1 = addrspacecast ptr %out to ptr addrspace(1)
+; IRC: %out2 = addrspacecast ptr addrspace(1) %out1 to ptr
+; PTXC: cvta.to.global.u64
+; PTXC: cvta.to.global.u64
+; PTXC: ld.global.u32
+; PTXC: st.global.u32
+
+; OpenCL can't make assumptions about incoming pointer, so we should generate
+; generic pointers load/store.
+; IRO-NOT: addrspacecast
+; PTXO-NOT: cvta.to.global
+; PTXO: ld.u32
+; PTXO: st.u32
+ %v = load i32, ptr %in, align 4
+ store i32 %v, ptr %out, align 4
+ ret void
+}
+
+; COMMON-LABEL: ptr_nongeneric
+define void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(4) %in) {
+; IR-NOT: addrspacecast
+; PTX-NOT: cvta.to.global
+; PTX: ld.const.u32
+; PTX st.global.u32
+ %v = load i32, ptr addrspace(4) %in, align 4
+ store i32 %v, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+
; Function Attrs: convergent nounwind
declare dso_local ptr @escape(ptr) local_unnamed_addr
+!nvvm.annotations = !{!0, !1}
+!0 = !{ptr @ptr_generic, !"kernel", i32 1}
+!1 = !{ptr @ptr_nongeneric, !"kernel", i32 1}
More information about the llvm-commits
mailing list