[llvm] [NVPTX] Improved support for grid_constant (PR #97112)
Akshay Deodhar via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 28 14:08:39 PDT 2024
https://github.com/akshayrdeodhar created https://github.com/llvm/llvm-project/pull/97112
- Handles escaped grid_constant pointers less conservatively: uses in calls, ptrtoints, and stores (where the pointer is the _value operand_) are cast to the generic address space immediately before the escape, while all other uses stay in the param address space (see the sketch below).
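As a rough sketch of the new behaviour (modeled on the grid_const_partial_escape test added below; the function and value names are illustrative, and %input is assumed to be annotated as a grid_constant via !nvvm.annotations as in the tests), a grid_constant byval parameter that is both loaded and passed to a call keeps its load in the param address space and is only converted to a generic pointer at the escape point:

  ; before -nvptx-lower-args
  define void @kernel(ptr byval(i32) %input, ptr %output) {
    %val = load i32, ptr %input
    store i32 %val, ptr %output
    %call = call i32 @escape(ptr %input)        ; %input escapes here
    ret void
  }

  ; after -nvptx-lower-args (sketch)
  define void @kernel(ptr byval(i32) %input, ptr %output) {
    %input.param = addrspacecast ptr %input to ptr addrspace(101)
    %val = load i32, ptr addrspace(101) %input.param   ; lowered to ld.param
    store i32 %val, ptr %output
    %input.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input.param)
    %call = call i32 @escape(ptr %input.gen)    ; cvta.param only at the escape
    ret void
  }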
From b0a9ec9da41d57e4c42d28580c338cc9ff74ea06 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Wed, 19 Jun 2024 03:28:54 +0000
Subject: [PATCH] [NVPTX] Improved support for grid_constant
- Supports escaped grid_constant pointers less conservatively: uses in
  calls, ptrtoints, and stores (where the pointer is the _value operand_)
  are cast to the generic address space immediately before the escape,
  while other uses are kept in the param address space.
---
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 173 ++++++++++---
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 3 +-
.../CodeGen/NVPTX/lower-args-gridconstant.ll | 244 ++++++++++++++++--
3 files changed, 365 insertions(+), 55 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index e63c7a61c6f26..d5dffb8998a04 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -12,8 +12,7 @@
// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
//
// Kernel parameters are read-only and accessible only via ld.param
-// instruction, directly or via a pointer. Pointers to kernel
-// arguments can't be converted to generic address space.
+// instruction, directly or via a pointer.
//
// Device function parameters are directly accessible via
// ld.param/st.param, but taking the address of one returns a pointer
@@ -54,8 +53,10 @@
// ...
// }
//
-// 2. Convert pointers in a byval kernel parameter to pointers in the global
-// address space. As #2, it allows NVPTX to emit more ld/st.global. E.g.,
+// 2. Convert byval kernel parameters to pointers in the param address space
+// (so that NVPTX emits ld/st.param). Convert pointers *within* a byval
+// kernel parameter to pointers in the global address space. This allows
+// NVPTX to emit ld/st.global.
//
// struct S {
// int *x;
@@ -68,22 +69,68 @@
//
// "b" points to the global address space. In the IR level,
//
-// define void @foo({i32*, i32*}* byval %input) {
-// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
-// %b = load i32*, i32** %b_ptr
+// define void @foo(ptr byval %input) {
+// %b_ptr = getelementptr {ptr, ptr}, ptr %input, i64 0, i32 1
+// %b = load ptr, ptr %b_ptr
// ; use %b
// }
//
// becomes
//
// define void @foo({i32*, i32*}* byval %input) {
-// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
-// %b = load i32*, i32** %b_ptr
-// %b_global = addrspacecast i32* %b to i32 addrspace(1)*
-// %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32*
+// %b_param = addrspacecast ptr %input to ptr addrspace(101)
+// %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
+// %b = load ptr, ptr addrspace(101) %b_ptr
+// %b_global = addrspacecast ptr %b to ptr addrspace(1)
// ; use %b_generic
// }
//
+// Create a local copy of kernel byval parameters used in a way that *might* mutate
+// the parameter, by storing it in an alloca. Mutating a "grid_constant" parameter
+// is undefined behaviour, so grid_constant parameters don't require local copies.
+//
+// define void @foo(ptr byval(%struct.s) align 4 %input) {
+// store i32 42, ptr %input
+// ret void
+// }
+//
+// becomes
+//
+// define void @foo(ptr byval(%struct.s) align 4 %input) #1 {
+// %input1 = alloca %struct.s, align 4
+// %input2 = addrspacecast ptr %input to ptr addrspace(101)
+// %input3 = load %struct.s, ptr addrspace(101) %input2, align 4
+// store %struct.s %input3, ptr %input1, align 4
+// store i32 42, ptr %input1, align 4
+// ret void
+// }
+//
+// If %input were passed to a device function, or written to memory,
+// conservatively assume that %input gets mutated, and create a local copy.
+//
+// Convert param pointers to grid_constant byval kernel parameters that are
+// passed into calls (device functions, intrinsics, inline asm), or otherwise
+// "escape" (into stores/ptrtoints) to the generic address space, using the
+// `nvvm.ptr.param.to.gen` intrinsic, so that NVPTX emits cvta.param
+// (available for sm_70+).
+//
+// define void @foo(ptr byval(%struct.s) %input) {
+// ; %input is a grid_constant
+// %call = call i32 @escape(ptr %input)
+// ret void
+// }
+//
+// becomes
+//
+// define void @foo(ptr byval(%struct.s) %input) {
+// %input1 = addrspacecast ptr %input to ptr addrspace(101)
+// ; the following intrinsic converts the pointer to generic. We don't use an addrspacecast
+// ; to prevent generic -> param -> generic from getting cancelled out
+// %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
+// %call = call i32 @escape(ptr %input1.gen)
+// ret void
+// }
+//
// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
// cancel the addrspacecast pair this pass emits.
//===----------------------------------------------------------------------===//
@@ -166,19 +213,22 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
// ones in parameter AS, so we can access them using ld.param.
// =============================================================================
-// Replaces the \p OldUser instruction with the same in parameter AS.
-// Only Load and GEP are supported.
-static void convertToParamAS(Value *OldUser, Value *Param) {
- Instruction *I = dyn_cast<Instruction>(OldUser);
- assert(I && "OldUser must be an instruction");
+// For Loads, replaces the \p OldUse of the pointer with a Use of the same
+// pointer in parameter AS.
+// For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to
+// generic using cvta.param.
+static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
+ Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
+ assert(I && "OldUse must be in an instruction");
struct IP {
+ Use *OldUse;
Instruction *OldInstruction;
Value *NewParam;
};
- SmallVector<IP> ItemsToConvert = {{I, Param}};
+ SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
SmallVector<Instruction *> InstructionsToDelete;
- auto CloneInstInParamAS = [](const IP &I) -> Value * {
+ auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
LI->setOperand(0, I.NewParam);
return LI;
@@ -202,6 +252,43 @@ static void convertToParamAS(Value *OldUser, Value *Param) {
// Just pass through the argument, the old ASC is no longer needed.
return I.NewParam;
}
+
+ if (GridConstant) {
+ auto GetParamAddrCastToGeneric =
+ [](Value *Addr, Instruction *OriginalUser) -> Value * {
+ PointerType *ReturnTy =
+ PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_GENERIC);
+ Function *CvtToGen = Intrinsic::getDeclaration(
+ OriginalUser->getModule(), Intrinsic::nvvm_ptr_param_to_gen,
+ {ReturnTy, PointerType::get(OriginalUser->getContext(),
+ ADDRESS_SPACE_PARAM)});
+
+ // Cast param address to generic address space
+ Value *CvtToGenCall =
+ CallInst::Create(CvtToGen, Addr, Addr->getName() + ".gen",
+ OriginalUser->getIterator());
+ return CvtToGenCall;
+ };
+
+ if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
+ I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
+ return CI;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
+ // byval address is being stored, cast it to generic
+ if (SI->getValueOperand() == I.OldUse->get())
+ SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
+ return SI;
+ }
+ if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
+ if (PI->getPointerOperand() == I.OldUse->get())
+ PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
+ return PI;
+ }
+ llvm_unreachable(
+ "Instruction unsupported even for grid_constant argument");
+ }
+
llvm_unreachable("Unsupported instruction");
};
@@ -213,8 +300,8 @@ static void convertToParamAS(Value *OldUser, Value *Param) {
// We've created a new instruction. Queue users of the old instruction to
// be converted and the instruction itself to be deleted. We can't delete
// the old instruction yet, because it's still in use by a load somewhere.
- for (Value *V : I.OldInstruction->users())
- ItemsToConvert.push_back({cast<Instruction>(V), NewInst});
+ for (Use &U : I.OldInstruction->uses())
+ ItemsToConvert.push_back({&U, cast<Instruction>(U.getUser()), NewInst});
InstructionsToDelete.push_back(I.OldInstruction);
}
@@ -272,6 +359,7 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
SmallVector<Load> Loads;
std::queue<LoadContext> Worklist;
Worklist.push({ArgInParamAS, 0});
+ bool IsGridConstant = isParamGridConstant(*Arg);
while (!Worklist.empty()) {
LoadContext Ctx = Worklist.front();
@@ -303,8 +391,14 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
continue;
}
+ // Calls, stores, and ptrtoints are supported for grid_constant arguments
+ if (IsGridConstant &&
+ (isa<CallInst>(CurUser) || isa<StoreInst>(CurUser) ||
+ isa<PtrToIntInst>(CurUser)))
+ continue;
+
llvm_unreachable("All users must be one of: load, "
- "bitcast, getelementptr.");
+ "bitcast, getelementptr, call, store, ptrtoint");
}
}
@@ -317,14 +411,15 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Argument *Arg) {
+ bool IsGridConstant = isParamGridConstant(*Arg);
Function *Func = Arg->getParent();
BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
Type *StructType = Arg->getParamByValType();
assert(StructType && "Missing byval type");
- auto IsALoadChain = [&](Value *Start) {
+ auto AreSupportedUsers = [&](Value *Start) {
SmallVector<Value *, 16> ValuesToCheck = {Start};
- auto IsALoadChainInstr = [](Value *V) -> bool {
+ auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
return true;
// ASC to param space are OK, too -- we'll just strip them.
@@ -332,34 +427,43 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
return true;
}
+ // Simple calls, stores, and ptrtoints are supported for grid_constants;
+ // writes through these pointers are undefined behaviour.
+ if (IsGridConstant &&
+ (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
+ return true;
return false;
};
while (!ValuesToCheck.empty()) {
Value *V = ValuesToCheck.pop_back_val();
- if (!IsALoadChainInstr(V)) {
+ if (!IsSupportedUse(V)) {
LLVM_DEBUG(dbgs() << "Need a "
<< (isParamGridConstant(*Arg) ? "cast " : "copy ")
<< "of " << *Arg << " because of " << *V << "\n");
(void)Arg;
return false;
}
- if (!isa<LoadInst>(V))
+ if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
+ !isa<PtrToIntInst>(V))
llvm::append_range(ValuesToCheck, V->users());
}
return true;
};
- if (llvm::all_of(Arg->users(), IsALoadChain)) {
+ if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
// Convert all loads and intermediate operations to use parameter AS and
// skip creation of a local copy of the argument.
- SmallVector<User *, 16> UsersToUpdate(Arg->users());
+ SmallVector<Use *, 16> UsesToUpdate;
+ for (Use &U : Arg->uses())
+ UsesToUpdate.push_back(&U);
+
Value *ArgInParamAS = new AddrSpaceCastInst(
Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
FirstInst);
- for (Value *V : UsersToUpdate)
- convertToParamAS(V, ArgInParamAS);
- LLVM_DEBUG(dbgs() << "No need to copy " << *Arg << "\n");
+ for (Use *U : UsesToUpdate)
+ convertToParamAS(U, ArgInParamAS, IsGridConstant);
+ LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
const auto *TLI =
cast<NVPTXTargetLowering>(TM.getSubtargetImpl()->getTargetLowering());
@@ -376,16 +480,11 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
// temporary copy. When a pointer might have escaped, conservatively replace
// all of its uses (which might include a device function call) with a cast
// to the generic address space.
- // TODO: only cast byval grid constant parameters at use points that need
- // generic address (e.g., merging parameter pointers with other address
- // space, or escaping to call-sites, inline-asm, memory), and use the
- // parameter address space for normal loads.
IRBuilder<> IRB(&Func->getEntryBlock().front());
// Cast argument to param address space
- auto *CastToParam =
- cast<AddrSpaceCastInst>(IRB.CreateAddrSpaceCast(
- Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param"));
+ auto *CastToParam = cast<AddrSpaceCastInst>(IRB.CreateAddrSpaceCast(
+ Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param"));
// Cast param address to generic address space. We do not use an
// addrspacecast to generic here, because, LLVM considers `Arg` to be in the
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index e4b2ec868519c..80361744fd5b6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -210,7 +210,8 @@ bool isParamGridConstant(const Value &V) {
if (const Argument *Arg = dyn_cast<Argument>(&V)) {
// "grid_constant" counts argument indices starting from 1
if (Arg->hasByValAttr() &&
- argHasNVVMAnnotation(*Arg, "grid_constant", /*StartArgIndexAtOne*/true)) {
+ argHasNVVMAnnotation(*Arg, "grid_constant",
+ /*StartArgIndexAtOne*/ true)) {
assert(isKernelFunction(*Arg->getParent()) &&
"only kernel arguments can be grid_constant");
return true;
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index 46f54e0e6f4d4..f6db9c429dba5 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT
; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX
@@ -67,22 +67,22 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
; PTX: mov.{{.*}} [[RD1:%.*]], multiple_grid_const_escape_param_0;
; PTX: mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2;
; PTX: mov.{{.*}} [[RD3:%.*]], [[RD2]];
-; PTX: cvta.param.{{.*}} [[RD4:%.*]], [[RD3]];
-; PTX: mov.u64 [[RD5:%.*]], [[RD1]];
-; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD5]];
+; PTX: mov.{{.*}} [[RD4:%.*]], [[RD1]];
+; PTX: cvta.param.{{.*}} [[RD5:%.*]], [[RD4]];
+; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD3]];
; PTX: {
-; PTX: st.param.b64 [param0+0], [[RD6]];
-; PTX: st.param.b64 [param2+0], [[RD4]];
+; PTX: st.param.b64 [param0+0], [[RD5]];
+; PTX: st.param.b64 [param2+0], [[RD6]];
;
; OPT-LABEL: define void @multiple_grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) {
-; OPT-NOT: alloca i32
; OPT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
-; OPT-NOT: alloca [[STRUCT_S]]
; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NOT: alloca %struct.s
+; OPT: [[A_ADDR:%.*]] = alloca i32, align 4
; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, ptr [[B_PARAM_GEN]])
+; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
;
%a.addr = alloca i32, align 4
store i32 %a, ptr %a.addr, align 4
@@ -111,17 +111,19 @@ define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %
define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
; PTX-LABEL: grid_const_inlineasm_escape(
; PTX-NOT .local
-; PTX: cvta.param.u64 [[RD2:%.*]], {{.*}}
-; PTX: add.{{.*}} [[RD3:%.*]], [[RD2]], 4;
-; PTX: add.s64 [[RD1:%.*]], [[RD2]], [[RD3]];
+; PTX: add.{{.*}} [[RD2:%.*]], [[RD1:%.*]], 4;
+; PTX: cvta.param.u64 [[RD4:%.*]], [[RD2]]
+; PTX: cvta.param.u64 [[RD3:%.*]], [[RD1]]
+; PTX: add.s64 [[RD5:%.*]], [[RD3]], [[RD4]];
;
; OPT-LABEL: define void @grid_const_inlineasm_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) {
; OPT-NOT: alloca [[STRUCT_S]]
; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT_PARAM_GEN]], i32 0, i32 0
-; OPT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT_PARAM_GEN]], i32 0, i32 1
+; OPT: [[TMPPTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 0
+; OPT: [[TMPPTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 1
+; OPT: [[TMPPTR22_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR22]])
+; OPT: [[TMPPTR13_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR13]])
; OPT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
;
%tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
@@ -131,10 +133,200 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
ret void
}
+define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
+; PTX-LABEL: grid_const_partial_escape(
+; PTX-NOT: .local
+; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escape_param_0];
+; PTX: add.{{.*}}
+; PTX: cvta.param.u64 [[RD3:%.*]], {{%.*}}
+; PTX: st.param.{{.*}} [param0+0], [[RD3]]
+; PTX: call
+;
+; OPT-LABEL: define void @grid_const_partial_escape(
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]], ptr {{%.*}}) {
+; OPT-NOT: alloca
+; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT1]], align 4
+; OPT: [[TWICE:%.*]] = add i32 [[VAL]], [[VAL]]
+; OPT: store i32 [[TWICE]]
+; OPT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
+; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
+; OPT: ret void
+;
+ %val = load i32, ptr %input
+ %twice = add i32 %val, %val
+ store i32 %twice, ptr %output
+ %call = call i32 @escape(ptr %input)
+ ret void
+}
+
+define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
+; PTX-LABEL: grid_const_partial_escapemem(
+; PTX: {
+; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escapemem_param_0];
+; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_partial_escapemem_param_0+4];
+; PTX: cvta.param.{{.*}} [[RD5:%.*]], {{%.*}};
+; PTX: st.global.{{.*}} [{{.*}}], [[RD5]];
+; PTX: add.s32 [[R3:%.*]], [[R1]], [[R2]]
+; PTX: st.param.{{.*}} [param0+0], [[RD5]]
+; PTX: escape
+; OPT-LABEL: define i32 @grid_const_partial_escapemem(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr {{%.*}}) {
+; OPT-NOT: alloca
+; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT: [[PTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 0
+; OPT: [[VAL1:%.*]] = load i32, ptr addrspace(101) [[PTR13]], align 4
+; OPT: [[PTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 1
+; OPT: [[VAL2:%.*]] = load i32, ptr addrspace(101) [[PTR22]], align 4
+; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT: store ptr [[INPUT1]]
+; OPT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
+; OPT: [[PTR1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[PTR13]])
+; OPT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
+;
+ %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
+ %val1 = load i32, ptr %ptr1
+ %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
+ %val2 = load i32, ptr %ptr2
+ store ptr %input, ptr %output
+ %add = add i32 %val1, %val2
+ %call2 = call i32 @escape(ptr %ptr1)
+ ret i32 %add
+}
+
+define void @grid_const_phi_escape(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
+; PTX-LABEL: grid_const_phi_escape(
+; PTX: cvta.param.{{.*}} [[RD1:%.*]], {{.*}}
+; PTX: @[[P1:%.*]] bra $L__BB[[TARGET_LABEL:[_0-9]+]];
+; PTX: $L__BB[[TARGET_LABEL]]:
+; PTX: ld.{{.*}} [[R1:%.*]], [[[RD1]]];
+;
+; OPT-LABEL: define void @grid_const_phi_escape(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr {{%.*}}) {
+; OPT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT: br i1 {{.*}}, label %[[FIRST:.*]], label %[[SECOND:.*]]
+; OPT: [[FIRST]]:
+; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
+; OPT: br label %[[MERGE:.*]]
+; OPT: [[SECOND]]:
+; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
+; OPT: br label %[[MERGE]]
+; OPT: [[MERGE]]:
+; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; OPT-NOT: load i32, ptr addrspace(101)
+; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+;
+
+ %val = load i32, ptr %inout
+ %less = icmp slt i32 %val, 0
+ br i1 %less, label %first, label %second
+first:
+ %ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0
+ br label %merge
+second:
+ %ptr2 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 1
+ br label %merge
+merge:
+ %ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second]
+ %valloaded = load i32, ptr %ptrnew
+ store i32 %valloaded, ptr %inout
+ ret void
+}
+
+; NOTE: %input2 is *not* grid_constant
+define void @grid_const_phi_escape2(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
+; PTX-LABEL: grid_const_phi_escape2(
+; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_phi_escape2_param_1+4];
+; PTX: @[[P1:%.*]] bra $L__BB[[LABEL:[_0-9]+]];
+; PTX: cvta.param.u64 [[RD1:%.*]], [[RD2:%.*]];
+; PTX: ld.u32 [[R1]], [[[RD1]]];
+; PTX: $L__BB[[LABEL]]:
+; PTX: st.global.u32 [[[RD3:%.*]]], [[R1]]
+; OPT-LABEL: define void @grid_const_phi_escape2(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr {{%.*}}) {
+; OPT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
+; OPT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; OPT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
+; OPT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
+; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]])
+; OPT: br i1 [[LESS:%.*]], label %[[FIRST:.*]], label %[[SECOND:.*]]
+; OPT: [[FIRST]]:
+; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
+; OPT: br label %[[MERGE:.*]]
+; OPT: [[SECOND]]:
+; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1
+; OPT: br label %[[MERGE]]
+; OPT: [[MERGE]]:
+; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+;
+ %val = load i32, ptr %inout
+ %less = icmp slt i32 %val, 0
+ br i1 %less, label %first, label %second
+first:
+ %ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0
+ br label %merge
+second:
+ %ptr2 = getelementptr inbounds %struct.s, ptr %input2, i32 0, i32 1
+ br label %merge
+merge:
+ %ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second]
+ %valloaded = load i32, ptr %ptrnew
+ store i32 %valloaded, ptr %inout
+ ret void
+}
+
+; NOTE: %input2 is *not* grid_constant
+define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
+; PTX-LABEL: grid_const_select_escape(
+; PTX: cvta.param.{{.*}} [[RD2:%.*]], [[RD1:%.*]]
+; PTX: setp.lt.{{.*}} [[P1:%.*]], {{%.*}}, 0
+; PTX: add.{{.*}} [[RD3:%.*]], %SP, 0;
+; PTX: selp.{{.*}} [[RD4:%.*]], [[RD2]], [[RD3]], [[P1]];
+; PTX: ld.u32 {{%.*}}, [[[RD4]]];
+; OPT-LABEL: define void @grid_const_select_escape(
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) {
+; OPT: [[INPUT24:%.*]] = alloca i32, align 4
+; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]])
+; OPT: load i32, ptr [[INOUT]]
+; OPT: [[PTRNEW:%.*]] = select i1 [[LESS:%.*]], ptr [[INPUT11]], ptr [[INPUT24]]
+; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+;
+ %val = load i32, ptr %inout
+ %less = icmp slt i32 %val, 0
+ %ptrnew = select i1 %less, ptr %input1, ptr %input2
+ %valloaded = load i32, ptr %ptrnew
+ store i32 %valloaded, ptr %inout
+ ret void
+}
+
+define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
+; PTX-LABEL: grid_const_ptrtoint(
+; PTX-NOT: .local
+; PTX: ld.param.{{.*}} {{%.*}}, [grid_const_ptrtoint_param_0];
+; PTX: cvta.param.u64 [[RD1:%.*]], {{%.*}}
+; PTX: cvt.u32.u64 {{%.*}}, [[RD1]]
+; OPT-LABEL: define i32 @grid_const_ptrtoint(
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) {
+; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT2]]
+; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
+ %val = load i32, ptr %input
+ %ptrval = ptrtoint ptr %input to i32
+ %keepalive = add i32 %val, %ptrval
+ ret i32 %keepalive
+}
+
+
+
+declare dso_local void @dummy() local_unnamed_addr
declare dso_local ptr @escape(ptr) local_unnamed_addr
declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
-!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11}
+!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23}
!0 = !{ptr @grid_const_int, !"kernel", i32 1, !"grid_constant", !1}
!1 = !{i32 1}
@@ -153,3 +345,21 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
!10 = !{ptr @grid_const_inlineasm_escape, !"kernel", i32 1, !"grid_constant", !11}
!11 = !{i32 1}
+
+!12 = !{ptr @grid_const_partial_escape, !"kernel", i32 1, !"grid_constant", !13}
+!13 = !{i32 1}
+
+!14 = !{ptr @grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15}
+!15 = !{i32 1}
+
+!16 = !{ptr @grid_const_phi_escape, !"kernel", i32 1, !"grid_constant", !17}
+!17 = !{i32 1}
+
+!18 = !{ptr @grid_const_phi_escape2, !"kernel", i32 1, !"grid_constant", !19}
+!19 = !{i32 1}
+
+!20 = !{ptr @grid_const_select_escape, !"kernel", i32 1, !"grid_constant", !21}
+!21 = !{i32 1}
+
+!22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23}
+!23 = !{i32 1}