[llvm] [NVPTX] Improve copy avoidance during lowering. (PR #106423)
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 28 12:48:06 PDT 2024
https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/106423
>From b5bc3ad2ae818899c917849eaac9fe8856c6c9a3 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Tue, 27 Aug 2024 16:20:35 -0700
Subject: [PATCH 1/3] [PtrUseVisitor] Allow using Argument as a starting point
Argument is another possible starting point for the pointer traversal,
and PtrUseVisitor should be able to handle it.
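As a minimal illustration (not part of the patch; names are hypothetical), a
derived visitor can now start the traversal from a kernel Argument instead of
an Instruction, assuming the relaxed visitPtr(Value &) signature below:

  // Sketch only: relies on PI, U, and setEscapedAndAborted() from
  // PtrUseVisitorBase, mirroring how in-tree visitors use them.
  #include "llvm/Analysis/PtrUseVisitor.h"
  #include "llvm/IR/Function.h"
  using namespace llvm;

  namespace {
  struct EscapeChecker : public PtrUseVisitor<EscapeChecker> {
    EscapeChecker(const DataLayout &DL) : PtrUseVisitor(DL) {}
    // Treat storing the pointer value itself as an escape.
    void visitStoreInst(StoreInst &SI) {
      if (U->get() == SI.getValueOperand())
        PI.setEscapedAndAborted(&SI);
    }
  };
  } // namespace

  static bool argEscapes(Argument &Arg) {
    EscapeChecker C(Arg.getParent()->getDataLayout());
    // visitPtr() used to require an Instruction; with this change an Argument
    // can be the traversal root as well.
    return C.visitPtr(Arg).isEscaped();
  }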
---
llvm/include/llvm/Analysis/PtrUseVisitor.h | 7 +++++--
llvm/lib/Analysis/PtrUseVisitor.cpp | 2 +-
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h
index b6cc14d2077af0..539d302bb70a1b 100644
--- a/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -157,7 +157,7 @@ class PtrUseVisitorBase {
///
/// This will visit the users with the same offset of the current visit
/// (including an unknown offset if that is the current state).
- void enqueueUsers(Instruction &I);
+ void enqueueUsers(Value &I);
/// Walk the operands of a GEP and adjust the offset as appropriate.
///
@@ -208,11 +208,14 @@ class PtrUseVisitor : protected InstVisitor<DerivedT>,
/// Recursively visit the uses of the given pointer.
/// \returns An info struct about the pointer. See \c PtrInfo for details.
- PtrInfo visitPtr(Instruction &I) {
+ /// We may also need to start the traversal from an Argument pointer, so the
+ /// input is the common Value type.
+ PtrInfo visitPtr(Value &I) {
// This must be a pointer type. Get an integer type suitable to hold
// offsets on this pointer.
// FIXME: Support a vector of pointers.
assert(I.getType()->isPointerTy());
+ assert(isa<Instruction>(I) || isa<Argument>(I));
IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(I.getType()));
IsOffsetKnown = true;
Offset = APInt(IntIdxTy->getBitWidth(), 0);
diff --git a/llvm/lib/Analysis/PtrUseVisitor.cpp b/llvm/lib/Analysis/PtrUseVisitor.cpp
index 49304818d7efed..9c79546f491eff 100644
--- a/llvm/lib/Analysis/PtrUseVisitor.cpp
+++ b/llvm/lib/Analysis/PtrUseVisitor.cpp
@@ -17,7 +17,7 @@
using namespace llvm;
-void detail::PtrUseVisitorBase::enqueueUsers(Instruction &I) {
+void detail::PtrUseVisitorBase::enqueueUsers(Value &I) {
for (Use &U : I.uses()) {
if (VisitedUses.insert(&U).second) {
UseToVisit NewU = {
>From a8835f7c339c4f973eba977361dac54e405f1d9f Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 23 Aug 2024 17:36:37 -0700
Subject: [PATCH 2/3] [NVPTX] Improve copy avoidance during lowering.
On newer GPUs, where the `cvta.param` instruction is available, we can avoid
making local copies of byval arguments in a few more cases where their
pointers are used, even when __grid_constant__ is not specified (a short CUDA
sketch follows the list below):
- phi
- select
- memcpy from the parameter.
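A purely illustrative CUDA sketch (hypothetical kernel, not taken from the
tests in this patch) of the pattern this enables: the byval parameters are
only read through a select over their addresses, so on sm_70+ no local copy
is needed. A memcpy whose source is the parameter is handled similarly.

  // Hypothetical device code; struct and kernel names are made up.
  struct S { int a; int b; };

  __global__ void pick(S s1, S s2, bool c, int *out) {
    // The parameter addresses only feed a select (or phi) and loads; they
    // are never written to and never escape to another function.
    const S *p = c ? &s1 : &s2;
    *out = p->a + p->b;
  }

On pre-sm_70 targets, where we cannot take the address of a parameter without
a copy, such kernels still get a local copy as before.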
---
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 267 ++++++---
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 3 +
.../CodeGen/NVPTX/lower-args-gridconstant.ll | 556 ++++++++++++------
llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 402 ++++++++-----
4 files changed, 839 insertions(+), 389 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 1205ad4c6b008f..243f39d8a16719 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -79,15 +79,15 @@
//
// define void @foo({i32*, i32*}* byval %input) {
// %b_param = addrspacecat ptr %input to ptr addrspace(101)
-// %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
-// %b = load ptr, ptr addrspace(101) %b_ptr
-// %b_global = addrspacecast ptr %b to ptr addrspace(1)
-// ; use %b_generic
+// %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0,
+// i32 1 %b = load ptr, ptr addrspace(101) %b_ptr %b_global = addrspacecast
+// ptr %b to ptr addrspace(1) ; use %b_generic
// }
//
-// Create a local copy of kernel byval parameters used in a way that *might* mutate
-// the parameter, by storing it in an alloca. Mutations to "grid_constant" parameters
-// are undefined behaviour, and don't require local copies.
+// Create a local copy of kernel byval parameters used in a way that *might*
+// mutate the parameter, by storing it in an alloca. Mutations to
+// "grid_constant" parameters are undefined behaviour, and don't require
+// local copies.
//
// define void @foo(ptr byval(%struct.s) align 4 %input) {
// store i32 42, ptr %input
@@ -124,11 +124,11 @@
//
// define void @foo(ptr byval(%struct.s) %input) {
// %input1 = addrspacecast ptr %input to ptr addrspace(101)
-// ; the following intrinsic converts pointer to generic. We don't use an addrspacecast
-// ; to prevent generic -> param -> generic from getting cancelled out
-// %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
-// %call = call i32 @escape(ptr %input1.gen)
-// ret void
+// ; the following intrinsic converts pointer to generic. We don't use an
+// addrspacecast ; to prevent generic -> param -> generic from getting
+// cancelled out %input1.gen = call ptr
+// @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1) %call =
+// call i32 @escape(ptr %input1.gen) ret void
// }
//
// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
@@ -139,16 +139,21 @@
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include <numeric>
#include <queue>
@@ -217,7 +222,8 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
// pointer in parameter AS.
// For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to
// generic using cvta.param.
-static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
+static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,
+ bool IsGridConstant) {
Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
assert(I && "OldUse must be in an instruction");
struct IP {
@@ -228,7 +234,8 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
SmallVector<Instruction *> InstructionsToDelete;
- auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
+ auto CloneInstInParamAS = [HasCvtaParam,
+ IsGridConstant](const IP &I) -> Value * {
if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
LI->setOperand(0, I.NewParam);
return LI;
@@ -252,8 +259,25 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
// Just pass through the argument, the old ASC is no longer needed.
return I.NewParam;
}
+ if (auto *MI = dyn_cast<MemTransferInst>(I.OldInstruction)) {
+ if (MI->getRawSource() == I.OldUse->get()) {
+ // convert to memcpy/memmove from param space.
+ IRBuilder<> Builder(I.OldInstruction);
+ Intrinsic::ID ID = MI->getIntrinsicID();
+
+ CallInst *B = Builder.CreateMemTransferInst(
+ ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,
+ MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
+ for (unsigned I : {0, 1})
+ if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
+ B->addDereferenceableParamAttr(I, Bytes);
+ return B;
+ }
+ // We may be able to handle other cases if the argument is
+ // __grid_constant__
+ }
- if (GridConstant) {
+ if (HasCvtaParam) {
auto GetParamAddrCastToGeneric =
[](Value *Addr, Instruction *OriginalUser) -> Value * {
PointerType *ReturnTy =
@@ -269,24 +293,44 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
OriginalUser->getIterator());
return CvtToGenCall;
};
-
- if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
- I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
- return CI;
+ auto *ParamInGenericAS =
+ GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
+
+ // phi/select could use generic arg pointers w/o __grid_constant__
+ if (auto *PHI = dyn_cast<PHINode>(I.OldInstruction)) {
+ for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
+ if (V.get() == I.OldUse->get())
+ PHI->setIncomingValue(Idx, ParamInGenericAS);
+ }
}
- if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
- // byval address is being stored, cast it to generic
- if (SI->getValueOperand() == I.OldUse->get())
- SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
- return SI;
+ if (auto *SI = dyn_cast<SelectInst>(I.OldInstruction)) {
+ if (SI->getTrueValue() == I.OldUse->get())
+ SI->setTrueValue(ParamInGenericAS);
+ if (SI->getFalseValue() == I.OldUse->get())
+ SI->setFalseValue(ParamInGenericAS);
}
- if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
- if (PI->getPointerOperand() == I.OldUse->get())
- PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
- return PI;
+
+ // Escapes or writes can only use generic param pointers if
+ // __grid_constant__ is in effect.
+ if (IsGridConstant) {
+ if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
+ I.OldUse->set(ParamInGenericAS);
+ return CI;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
+ // byval address is being stored, cast it to generic
+ if (SI->getValueOperand() == I.OldUse->get())
+ SI->setOperand(0, ParamInGenericAS);
+ return SI;
+ }
+ if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
+ if (PI->getPointerOperand() == I.OldUse->get())
+ PI->setOperand(0, ParamInGenericAS);
+ return PI;
+ }
+ // TODO: If we allow stores, we should allow memcpy/memset to the
+ // parameter, too.
}
- llvm_unreachable(
- "Instruction unsupported even for grid_constant argument");
}
llvm_unreachable("Unsupported instruction");
@@ -409,49 +453,121 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
}
}
+namespace {
+struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
+ using Base = PtrUseVisitor<ArgUseChecker>;
+
+ bool IsGridConstant;
+ SmallPtrSet<Value *, 16> AllArgUsers;
+ // Set of phi/select instructions using the Arg
+ SmallPtrSet<Instruction *, 4> Conditionals;
+
+ ArgUseChecker(const DataLayout &DL, bool IsGridConstant)
+ : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {}
+
+ PtrInfo visitArgPtr(Argument &A) {
+ assert(A.getType()->isPointerTy());
+ IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));
+ IsOffsetKnown = false;
+ Offset = APInt(IntIdxTy->getBitWidth(), 0);
+ PI.reset();
+ AllArgUsers.clear();
+ Conditionals.clear();
+
+ LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
+ // Enqueue the uses of this pointer.
+ enqueueUsers(A);
+ AllArgUsers.insert(&A);
+
+ // Visit all the uses off the worklist until it is empty.
+ // Note that unlike PtrUseVisitor we intentionally do not track offsets.
+ // We're only interested in how we use the pointer.
+ while (!(Worklist.empty() || PI.isAborted())) {
+ UseToVisit ToVisit = Worklist.pop_back_val();
+ U = ToVisit.UseAndIsOffsetKnown.getPointer();
+ Instruction *I = cast<Instruction>(U->getUser());
+ AllArgUsers.insert(I);
+ if (isa<PHINode>(I) || isa<SelectInst>(I))
+ Conditionals.insert(I);
+ LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
+ Base::visit(I);
+ }
+ if (PI.isEscaped())
+ LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()
+ << "\n");
+ else if (PI.isAborted())
+ LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()
+ << "\n");
+ LLVM_DEBUG(dbgs() << "Traversed " << AllArgUsers.size() << " with "
+ << Conditionals.size() << " conditionals\n");
+ return PI;
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ // Storing the pointer escapes it.
+ if (U->get() == SI.getValueOperand())
+ return PI.setEscapedAndAborted(&SI);
+ // Writes to the pointer are UB w/ __grid_constant__, but do not force a
+ // copy.
+ if (!IsGridConstant)
+ return PI.setAborted(&SI);
+ }
+
+ void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+ // ASCs to param space are no-ops and do not need a copy
+ if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM)
+ return PI.setEscapedAndAborted(&ASC);
+ Base::visitAddrSpaceCastInst(ASC);
+ }
+
+ void visitPtrToIntInst(PtrToIntInst &I) {
+ if (IsGridConstant)
+ return;
+ Base::visitPtrToIntInst(I);
+ }
+ void visitPHINodeOrSelectInst(Instruction &I) {
+ assert(isa<PHINode>(I) || isa<SelectInst>(I));
+ }
+ // PHI and select just pass through the pointers.
+ void visitPHINode(PHINode &PN) { enqueueUsers(PN); }
+ void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); }
+
+ void visitMemTransferInst(MemTransferInst &II) {
+ if (*U == II.getRawDest() && !IsGridConstant)
+ PI.setAborted(&II);
+
+ // TODO: memcpy from arg is OK as it can get unrolled into ld.param.
+ // However, memcpys are currently expected to be unrolled before we
+ // get here, so we never see them in practice, and we do not currently
+ // handle them when we convert IR to access param space directly. So,
+ // we'll mark it as an escape for now. It would still force a copy on
+ // pre-sm_70 GPUs where we can't take address of a parameter w/o a copy.
+ //
+ // PI.setEscaped(&II);
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ if (*U == II.getRawDest() && !IsGridConstant)
+ PI.setAborted(&II);
+ }
+ // debug only helper.
+ auto &getVisitedUses() { return VisitedUses; }
+};
+} // namespace
void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Argument *Arg) {
- bool IsGridConstant = isParamGridConstant(*Arg);
Function *Func = Arg->getParent();
+ bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+ bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
+ const DataLayout &DL = Func->getDataLayout();
BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
Type *StructType = Arg->getParamByValType();
assert(StructType && "Missing byval type");
- auto AreSupportedUsers = [&](Value *Start) {
- SmallVector<Value *, 16> ValuesToCheck = {Start};
- auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
- if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
- return true;
- // ASC to param space are OK, too -- we'll just strip them.
- if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
- if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
- return true;
- }
- // Simple calls and stores are supported for grid_constants
- // writes to these pointers are undefined behaviour
- if (IsGridConstant &&
- (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
- return true;
- return false;
- };
-
- while (!ValuesToCheck.empty()) {
- Value *V = ValuesToCheck.pop_back_val();
- if (!IsSupportedUse(V)) {
- LLVM_DEBUG(dbgs() << "Need a "
- << (isParamGridConstant(*Arg) ? "cast " : "copy ")
- << "of " << *Arg << " because of " << *V << "\n");
- (void)Arg;
- return false;
- }
- if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
- !isa<PtrToIntInst>(V))
- llvm::append_range(ValuesToCheck, V->users());
- }
- return true;
- };
-
- if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
+ ArgUseChecker AUC(DL, IsGridConstant);
+ ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
+ // Easy case, accessing parameter directly is fine.
+ if (!(PI.isEscaped() || PI.isAborted()) && AUC.Conditionals.empty()) {
// Convert all loads and intermediate operations to use parameter AS and
// skip creation of a local copy of the argument.
SmallVector<Use *, 16> UsesToUpdate;
@@ -462,7 +578,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
FirstInst);
for (Use *U : UsesToUpdate)
- convertToParamAS(U, ArgInParamAS, IsGridConstant);
+ convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
const auto *TLI =
@@ -473,13 +589,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
return;
}
- const DataLayout &DL = Func->getDataLayout();
+ // We can't access the byval arg directly and need a pointer. On sm_70+ we
+ // have the ability to take a pointer to the argument without making a local
+ // copy. However, we're still not allowed to write to it. If the user
+ // specified `__grid_constant__` for the argument, we'll consider an escaped
+ // pointer as read-only.
unsigned AS = DL.getAllocaAddrSpace();
- if (isParamGridConstant(*Arg)) {
- // Writes to a grid constant are undefined behaviour. We do not need a
- // temporary copy. When a pointer might have escaped, conservatively replace
- // all of its uses (which might include a device function call) with a cast
- // to the generic address space.
+ if (HasCvtaParam && (!(PI.isEscaped() || PI.isAborted()) || IsGridConstant)) {
+ LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
+ // Replace all argument pointer uses (which might include a device function
+ // call) with a cast to the generic address space using cvta.param
+ // instruction, which avoids a local copy.
IRBuilder<> IRB(&Func->getEntryBlock().front());
// Cast argument to param address space
@@ -500,6 +620,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
// Do not replace Arg in the cast to param space
CastToParam->setOperand(0, Arg);
} else {
+ LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n");
// Otherwise we have to create a temporary copy.
AllocaInst *AllocA =
new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e47050734aae1e..38b5ee63f4b2b0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -93,6 +93,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
+ bool hasCvtaParam() const {
+ return SmVersion >= 70 && PTXVersion >= 77;
+ }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
// GPUs with "a" suffix have include architecture-accelerated features that
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index f6db9c429dba57..176dfee11cfb09 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -1,18 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT
-; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
+; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX
define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
; PTX-LABEL: grid_const_int(
-; PTX-NOT: ld.u32
-; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0];
-;
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u64 %rd1, [grid_const_int_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_int_param_1];
+; PTX-NEXT: ld.param.u32 %r2, [grid_const_int_param_0];
+; PTX-NEXT: add.s32 %r3, %r2, %r1;
+; PTX-NEXT: st.global.u32 [%rd2], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_int(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) {
-; OPT-NOT: alloca
-; OPT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
-;
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr
+; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
+; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
+; OPT-NEXT: store i32 [[ADD]], ptr [[OUT3]], align 4
+; OPT-NEXT: ret void
%tmp = load i32, ptr %input1, align 4
%add = add i32 %tmp, %input2
store i32 %add, ptr %out
@@ -24,19 +36,29 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
; PTX-LABEL: grid_const_struct(
; PTX: {
-; PTX-NOT: ld.u32
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_struct_param_0];
-; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_struct_param_0+4];
-;
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u64 %rd1, [grid_const_struct_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_struct_param_0];
+; PTX-NEXT: ld.param.u32 %r2, [grid_const_struct_param_0+4];
+; PTX-NEXT: add.s32 %r3, %r1, %r2;
+; PTX-NEXT: st.global.u32 [%rd2], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_struct(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) {
-; OPT-NOT: alloca
-; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
-; OPT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
-; OPT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
-; OPT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
+; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
+; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
+; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
+; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; OPT-NEXT: store i32 [[ADD]], ptr [[OUT5]], align 4
+; OPT-NEXT: ret void
%gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
%int1 = load i32, ptr %gep1
@@ -49,41 +71,85 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-LABEL: grid_const_escape(
; PTX: {
-; PTX-NOT: .local
-; PTX: cvta.param.{{.*}}
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_escape_param_0;
+; PTX-NEXT: mov.u64 %rd2, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
+; PTX-NEXT: { // callseq 0, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd3;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r1, [retval0+0];
+; PTX-NEXT: } // callseq 0
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) {
-; OPT-NOT: alloca [[STRUCT_S]]
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
+; OPT-NEXT: ret void
%call = call i32 @escape(ptr %input)
ret void
}
define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
; PTX-LABEL: multiple_grid_const_escape(
-; PTX: mov.{{.*}} [[RD1:%.*]], multiple_grid_const_escape_param_0;
-; PTX: mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2;
-; PTX: mov.{{.*}} [[RD3:%.*]], [[RD2]];
-; PTX: mov.{{.*}} [[RD4:%.*]], [[RD1]];
-; PTX: cvta.param.{{.*}} [[RD5:%.*]], [[RD4]];
-; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD3]];
-; PTX: {
-; PTX: st.param.b64 [param0+0], [[RD5]];
-; PTX: st.param.b64 [param2+0], [[RD6]];
-;
+; PTX: {
+; PTX-NEXT: .local .align 4 .b8 __local_depot3[4];
+; PTX-NEXT: .reg .b64 %SP;
+; PTX-NEXT: .reg .b64 %SPL;
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<9>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.u64 %SPL, __local_depot3;
+; PTX-NEXT: cvta.local.u64 %SP, %SPL;
+; PTX-NEXT: mov.b64 %rd1, multiple_grid_const_escape_param_0;
+; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_2;
+; PTX-NEXT: mov.u64 %rd3, %rd2;
+; PTX-NEXT: ld.param.u32 %r1, [multiple_grid_const_escape_param_1];
+; PTX-NEXT: cvta.param.u64 %rd4, %rd3;
+; PTX-NEXT: mov.u64 %rd5, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
+; PTX-NEXT: add.u64 %rd7, %SP, 0;
+; PTX-NEXT: add.u64 %rd8, %SPL, 0;
+; PTX-NEXT: st.local.u32 [%rd8], %r1;
+; PTX-NEXT: { // callseq 1, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd6;
+; PTX-NEXT: .param .b64 param1;
+; PTX-NEXT: st.param.b64 [param1+0], %rd7;
+; PTX-NEXT: .param .b64 param2;
+; PTX-NEXT: st.param.b64 [param2+0], %rd4;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape3,
+; PTX-NEXT: (
+; PTX-NEXT: param0,
+; PTX-NEXT: param1,
+; PTX-NEXT: param2
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r2, [retval0+0];
+; PTX-NEXT: } // callseq 1
+; PTX-NEXT: ret;
; OPT-LABEL: define void @multiple_grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) {
-; OPT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NOT: alloca %struct.s
-; OPT: [[A_ADDR:%.*]] = alloca i32, align 4
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; OPT-NEXT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
-;
+; OPT-NEXT: ret void
%a.addr = alloca i32, align 4
store i32 %a, ptr %a.addr, align 4
%call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
@@ -92,40 +158,58 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
; PTX-LABEL: grid_const_memory_escape(
-; PTX-NOT: .local
-; PTX: mov.b64 [[RD1:%.*]], grid_const_memory_escape_param_0;
-; PTX: cvta.param.u64 [[RD3:%.*]], [[RD2:%.*]];
-; PTX: st.global.u64 [[[RD4:%.*]]], [[RD3]];
-;
+; PTX: {
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_memory_escape_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.u64 %rd4, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT: st.global.u64 [%rd3], %rd5;
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_memory_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) {
-; OPT-NOT: alloca [[STRUCT_S]]
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT: store ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, align 8
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1)
+; OPT-NEXT: [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR5]], align 8
+; OPT-NEXT: ret void
store ptr %input, ptr %addr, align 8
ret void
}
define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
; PTX-LABEL: grid_const_inlineasm_escape(
-; PTX-NOT .local
-; PTX: add.{{.*}} [[RD2:%.*]], [[RD1:%.*]], 4;
-; PTX: cvta.param.u64 [[RD4:%.*]], [[RD2]]
-; PTX: cvta.param.u64 [[RD3:%.*]], [[RD1]]
-; PTX: add.s64 [[RD5:%.*]], [[RD3]], [[RD4]];
-;
+; PTX: {
+; PTX-NEXT: .reg .b64 %rd<8>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0;
+; PTX-NEXT: ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5;
+; PTX-NEXT: mov.u64 %rd7, %rd4;
+; PTX-NEXT: cvta.param.u64 %rd2, %rd7;
+; PTX-NEXT: add.s64 %rd3, %rd2, 4;
+; PTX-NEXT: // begin inline asm
+; PTX-NEXT: add.s64 %rd1, %rd2, %rd3;
+; PTX-NEXT: // end inline asm
+; PTX-NEXT: st.global.u64 [%rd6], %rd1;
+; PTX-NEXT: ret;
+; PTX-NOT .local
; OPT-LABEL: define void @grid_const_inlineasm_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) {
-; OPT-NOT: alloca [[STRUCT_S]]
-; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[TMPPTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 0
-; OPT: [[TMPPTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 1
-; OPT: [[TMPPTR22_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR22]])
-; OPT: [[TMPPTR13_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR13]])
-; OPT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1)
+; OPT-NEXT: [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr
+; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
+; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
+; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
+; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT5]], align 8
+; OPT-NEXT: ret void
%tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
%1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
@@ -135,24 +219,42 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escape(
-; PTX-NOT: .local
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escape_param_0];
-; PTX: add.{{.*}}
-; PTX: cvta.param.u64 [[RD3:%.*]], {{%.*}}
-; PTX: st.param.{{.*}} [param0+0], [[RD3]]
-; PTX: call
-;
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<5>;
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escape_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escape_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.u64 %rd4, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT: ld.u32 %r1, [%rd5];
+; PTX-NEXT: add.s32 %r2, %r1, %r1;
+; PTX-NEXT: st.global.u32 [%rd3], %r2;
+; PTX-NEXT: { // callseq 2, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd5;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r3, [retval0+0];
+; PTX-NEXT: } // callseq 2
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_partial_escape(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]], ptr {{%.*}}) {
-; OPT-NOT: alloca
-; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT1]], align 4
-; OPT: [[TWICE:%.*]] = add i32 [[VAL]], [[VAL]]
-; OPT: store i32 [[TWICE]]
-; OPT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
-; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
-; OPT: ret void
-;
+; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
+; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
+; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
+; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]]
+; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT5]], align 4
+; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
+; OPT-NEXT: ret void
%val = load i32, ptr %input
%twice = add i32 %val, %val
store i32 %twice, ptr %output
@@ -163,27 +265,46 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escapemem(
; PTX: {
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escapemem_param_0];
-; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_partial_escapemem_param_0+4];
-; PTX: cvta.param.{{.*}} [[RD5:%.*]], {{%.*}};
-; PTX: st.global.{{.*}} [{{.*}}], [[RD5]];
-; PTX: add.s32 [[R3:%.*]], [[R1]], [[R2]]
-; PTX: st.param.{{.*}} [param0+0], [[RD5]]
-; PTX: escape
+; PTX-NEXT: .reg .b32 %r<6>;
+; PTX-NEXT: .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escapemem_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escapemem_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.u64 %rd4, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT: ld.u32 %r1, [%rd5];
+; PTX-NEXT: ld.u32 %r2, [%rd5+4];
+; PTX-NEXT: st.global.u64 [%rd3], %rd5;
+; PTX-NEXT: add.s32 %r3, %r1, %r2;
+; PTX-NEXT: { // callseq 3, 0
+; PTX-NEXT: .param .b64 param0;
+; PTX-NEXT: st.param.b64 [param0+0], %rd5;
+; PTX-NEXT: .param .b32 retval0;
+; PTX-NEXT: call.uni (retval0),
+; PTX-NEXT: escape,
+; PTX-NEXT: (
+; PTX-NEXT: param0
+; PTX-NEXT: );
+; PTX-NEXT: ld.param.b32 %r4, [retval0+0];
+; PTX-NEXT: } // callseq 3
+; PTX-NEXT: st.param.b32 [func_retval0+0], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define i32 @grid_const_partial_escapemem(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr {{%.*}}) {
-; OPT-NOT: alloca
-; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[PTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 0
-; OPT: [[VAL1:%.*]] = load i32, ptr addrspace(101) [[PTR13]], align 4
-; OPT: [[PTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 1
-; OPT: [[VAL2:%.*]] = load i32, ptr addrspace(101) [[PTR22]], align 4
-; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
-; OPT: store ptr [[INPUT1]]
-; OPT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
-; OPT: [[PTR1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[PTR13]])
-; OPT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
+; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
+; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
+; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4
+; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
+; OPT-NEXT: [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4
+; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8
+; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
+; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
+; OPT-NEXT: ret i32 [[ADD]]
%ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%val1 = load i32, ptr %ptr1
%ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
@@ -194,29 +315,48 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
ret i32 %add
}
-define void @grid_const_phi_escape(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
-; PTX-LABEL: grid_const_phi_escape(
-; PTX: cvta.param.{{.*}} [[RD1:%.*]], {{.*}}
-; PTX: @[[P1:%.*]] bra $L__BB[[TARGET_LABEL:[_0-9]+]];
-; PTX: $L__BB[[TARGET_LABEL]]:
-; PTX: ld.{{.*}} [[R1:%.*]], [[[RD1]]];
-;
-; OPT-LABEL: define void @grid_const_phi_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr {{%.*}}) {
-; OPT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
-; OPT: br i1 {{.*}}, label %[[FIRST:.*]], label %[[SECOND:.*]]
+define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
+; PTX-LABEL: grid_const_phi(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<9>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd5, grid_const_phi_param_0;
+; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_param_1];
+; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6;
+; PTX-NEXT: mov.u64 %rd7, %rd5;
+; PTX-NEXT: cvta.param.u64 %rd8, %rd7;
+; PTX-NEXT: ld.global.u32 %r1, [%rd1];
+; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT: @%p1 bra $L__BB8_2;
+; PTX-NEXT: // %bb.1: // %second
+; PTX-NEXT: add.s64 %rd8, %rd8, 4;
+; PTX-NEXT: $L__BB8_2: // %merge
+; PTX-NEXT: ld.u32 %r2, [%rd8];
+; PTX-NEXT: st.global.u32 [%rd1], %r2;
+; PTX-NEXT: ret;
+; OPT-LABEL: define void @grid_const_phi(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT: [[FIRST]]:
-; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
-; OPT: br label %[[MERGE:.*]]
+; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
+; OPT-NEXT: br label %[[MERGE:.*]]
; OPT: [[SECOND]]:
-; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
-; OPT: br label %[[MERGE]]
+; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
+; OPT-NEXT: br label %[[MERGE]]
; OPT: [[MERGE]]:
-; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
-; OPT-NOT: load i32, ptr addrspace(101)
-; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-;
+; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
@@ -235,32 +375,53 @@ merge:
}
; NOTE: %input2 is *not* grid_constant
-define void @grid_const_phi_escape2(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
-; PTX-LABEL: grid_const_phi_escape2(
-; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_phi_escape2_param_1+4];
-; PTX: @[[P1:%.*]] bra $L__BB[[LABEL:[_0-9]+]];
-; PTX: cvta.param.u64 [[RD1:%.*]], [[RD2:%.*]];
-; PTX: ld.u32 [[R1]], [[[RD1]]];
-; PTX: $L__BB[[LABEL]]:
-; PTX: st.global.u32 [[[RD3:%.*]]], [[R1]]
-; OPT-LABEL: define void @grid_const_phi_escape2(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr {{%.*}}) {
-; OPT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
-; OPT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; OPT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
-; OPT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
-; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]])
-; OPT: br i1 [[LESS:%.*]], label %[[FIRST:.*]], label %[[SECOND:.*]]
+define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
+; PTX-LABEL: grid_const_phi_ngc(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<12>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd6, grid_const_phi_ngc_param_0;
+; PTX-NEXT: ld.param.u64 %rd7, [grid_const_phi_ngc_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd1, %rd7;
+; PTX-NEXT: mov.u64 %rd10, %rd6;
+; PTX-NEXT: cvta.param.u64 %rd11, %rd10;
+; PTX-NEXT: ld.global.u32 %r1, [%rd1];
+; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT: @%p1 bra $L__BB9_2;
+; PTX-NEXT: // %bb.1: // %second
+; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1;
+; PTX-NEXT: mov.u64 %rd9, %rd8;
+; PTX-NEXT: cvta.param.u64 %rd2, %rd9;
+; PTX-NEXT: add.s64 %rd11, %rd2, 4;
+; PTX-NEXT: $L__BB9_2: // %merge
+; PTX-NEXT: ld.u32 %r2, [%rd11];
+; PTX-NEXT: st.global.u32 [%rd1], %r2;
+; PTX-NEXT: ret;
+; OPT-LABEL: define void @grid_const_phi_ngc(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT: [[FIRST]]:
-; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
-; OPT: br label %[[MERGE:.*]]
+; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
+; OPT-NEXT: br label %[[MERGE:.*]]
; OPT: [[SECOND]]:
-; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1
-; OPT: br label %[[MERGE]]
+; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1
+; OPT-NEXT: br label %[[MERGE]]
; OPT: [[MERGE]]:
-; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
-;
+; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
br i1 %less, label %first, label %second
@@ -278,22 +439,42 @@ merge:
}
; NOTE: %input2 is *not* grid_constant
-define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
-; PTX-LABEL: grid_const_select_escape(
-; PTX: cvta.param.{{.*}} [[RD2:%.*]], [[RD1:%.*]]
-; PTX: setp.lt.{{.*}} [[P1:%.*]], {{%.*}}, 0
-; PTX: add.{{.*}} [[RD3:%.*]], %SP, 0;
-; PTX: selp.{{.*}} [[RD4:%.*]], [[RD2]], [[RD3]], [[P1]];
-; PTX: ld.u32 {{%.*}}, [[[RD4]]];
-; OPT-LABEL: define void @grid_const_select_escape(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) {
-; OPT: [[INPUT24:%.*]] = alloca i32, align 4
-; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]])
-; OPT: load i32, ptr [[INOUT]]
-; OPT: [[PTRNEW:%.*]] = select i1 [[LESS:%.*]], ptr [[INPUT11]], ptr [[INPUT24]]
-; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-;
+define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
+; PTX-LABEL: grid_const_select(
+; PTX: {
+; PTX-NEXT: .reg .pred %p<2>;
+; PTX-NEXT: .reg .b32 %r<3>;
+; PTX-NEXT: .reg .b64 %rd<10>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0;
+; PTX-NEXT: ld.param.u64 %rd2, [grid_const_select_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1;
+; PTX-NEXT: mov.u64 %rd5, %rd4;
+; PTX-NEXT: cvta.param.u64 %rd6, %rd5;
+; PTX-NEXT: mov.u64 %rd7, %rd1;
+; PTX-NEXT: cvta.param.u64 %rd8, %rd7;
+; PTX-NEXT: ld.global.u32 %r1, [%rd3];
+; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1;
+; PTX-NEXT: ld.u32 %r2, [%rd9];
+; PTX-NEXT: st.global.u32 [%rd3], %r2;
+; PTX-NEXT: ret;
+; OPT-LABEL: define void @grid_const_select(
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
+; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
%ptrnew = select i1 %less, ptr %input1, ptr %input2
@@ -304,16 +485,27 @@ define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(
define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
; PTX-LABEL: grid_const_ptrtoint(
-; PTX-NOT: .local
-; PTX: ld.param.{{.*}} {{%.*}}, [grid_const_ptrtoint_param_0];
-; PTX: cvta.param.u64 [[RD1:%.*]], {{%.*}}
-; PTX: cvt.u32.u64 {{%.*}}, [[RD1]]
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<4>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0;
+; PTX-NEXT: mov.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_ptrtoint_param_0];
+; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
+; PTX-NEXT: cvt.u32.u64 %r2, %rd3;
+; PTX-NEXT: add.s32 %r3, %r1, %r2;
+; PTX-NEXT: st.param.b32 [func_retval0+0], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define i32 @grid_const_ptrtoint(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) {
-; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT2]]
-; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
-; OPT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
+; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
+; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
+; OPT-NEXT: ret i32 [[KEEPALIVE]]
%val = load i32, ptr %input
%ptrval = ptrtoint ptr %input to i32
%keepalive = add i32 %val, %ptrval
@@ -352,13 +544,13 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
!14 = !{ptr @grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15}
!15 = !{i32 1}
-!16 = !{ptr @grid_const_phi_escape, !"kernel", i32 1, !"grid_constant", !17}
+!16 = !{ptr @grid_const_phi, !"kernel", i32 1, !"grid_constant", !17}
!17 = !{i32 1}
-!18 = !{ptr @grid_const_phi_escape2, !"kernel", i32 1, !"grid_constant", !19}
+!18 = !{ptr @grid_const_phi_ngc, !"kernel", i32 1, !"grid_constant", !19}
!19 = !{i32 1}
-!20 = !{ptr @grid_const_select_escape, !"kernel", i32 1, !"grid_constant", !21}
+!20 = !{ptr @grid_const_select, !"kernel", i32 1, !"grid_constant", !21}
!21 = !{i32 1}
!22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23}
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index f041f202777f61..7aec67a2ea628c 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -1,166 +1,300 @@
-; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32
-; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-
-%struct.ham = type { [4 x i32] }
-
-; // Verify that load with static offset into parameter is done directly.
-; CHECK-LABEL: .visible .entry static_offset
-; CHECK-NOT: .local
-; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
-; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-;
-; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1
-; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]]
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-;
-; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12];
-; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @static_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
-bb:
- %tmp = icmp eq i32 %arg2, 3
- br i1 %tmp, label %bb3, label %bb6
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple nvptx -mcpu=sm_70 -nvptx-lower-args -S | FileCheck %s --check-prefixes=CHECK,CHECK32
+source_filename = "<stdin>"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.S = type { i32, i32 }
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+declare dso_local void @_Z6escapePv(ptr noundef) local_unnamed_addr #0
-bb3: ; preds = %bb
- %tmp4 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 3
- %tmp5 = load i32, ptr %tmp4, align 4
- store i32 %tmp5, ptr %arg, align 4
- br label %bb6
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+declare dso_local void @_Z6escapei(i32 noundef) local_unnamed_addr #0
-bb6: ; preds = %bb3, %bb
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
+; CHECK-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %i = load i32, ptr %s, align 4
+ store i32 %i, ptr %out, align 4
ret void
}
-; // Verify that load with dynamic offset into parameter is also done directly.
-; CHECK-LABEL: .visible .entry dynamic_offset
-; CHECK-NOT: .local
-; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1
-; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]]
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-; CHECK64: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only_gep(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; CHECK-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT: ret void
;
-; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1
-; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]]
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-; CHECK32: add.s32 %[[param_w_offset:r[0-9]+]], %[[param_addr1]],
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ %i = load i32, ptr %b, align 4
+ store i32 %i, ptr %out, align 4
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only_gep_asc(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; CHECK-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT: ret void
;
-; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]];
-; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ %asc = addrspacecast ptr %b to ptr addrspace(101)
+ %i = load i32, ptr addrspace(101) %asc, align 4
+ store i32 %i, ptr %out, align 4
+ ret void
+}
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
-bb:
- %tmp = sext i32 %arg2 to i64
- %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
- %tmp4 = load i32, ptr %tmp3, align 4
- store i32 %tmp4, ptr %arg, align 4
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only_gep_asc0(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; CHECK-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; CHECK-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4
+; CHECK-NEXT: store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ %asc = addrspacecast ptr %b to ptr addrspace(101)
+ %asc0 = addrspacecast ptr addrspace(101) %asc to ptr
+ %i = load i32, ptr %asc0, align 4
+ store i32 %i, ptr %out, align 4
ret void
}
-; Same as above, but with a bitcast present in the chain
-; CHECK-LABEL:.visible .entry gep_bitcast
-; CHECK-NOT: .local
-; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_param_0]
-; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_param_1
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr(
+; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) #[[ATTR0]]
+; CHECK-NEXT: ret void
;
-; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_param_0]
-; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_param_1
+entry:
+ call void @_Z6escapePv(ptr noundef nonnull %s) #0
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr_gep(
+; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; CHECK-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR0]]
+; CHECK-NEXT: ret void
;
-; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_param_2]
-; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}]
-; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]];
-; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]];
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ call void @_Z6escapePv(ptr noundef nonnull %b) #0
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr_store(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
+; CHECK-NEXT: store i64 [[I]], ptr [[OUT2]], align 8
+; CHECK-NEXT: ret void
;
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @gep_bitcast(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
-bb:
- %n64 = sext i32 %n to i64
- %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
- %load = load i8, ptr %gep, align 4
- store i8 %load, ptr %out, align 4
+entry:
+ %i = ptrtoint ptr %s to i64
+ store i64 %i, ptr %out, align 8
ret void
}
-; Same as above, but with an ASC(101) present in the chain
-; CHECK-LABEL:.visible .entry gep_bitcast_asc
-; CHECK-NOT: .local
-; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_asc_param_0]
-; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_asc_param_1
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr_gep_store(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: store i64 [[I]], ptr [[OUT2]], align 8
+; CHECK-NEXT: ret void
;
-; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_asc_param_0]
-; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_asc_param_1
+entry:
+ %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+ %i = ptrtoint ptr %b to i64
+ store i64 %i, ptr %out, align 8
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_math_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_math_store(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64
+; CHECK-NEXT: [[ADD:%.*]] = or disjoint i64 [[I]], 1
+; CHECK-NEXT: store i64 [[ADD]], ptr [[OUT2]], align 8
+; CHECK-NEXT: ret void
;
-; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_asc_param_2]
-; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}]
-; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]];
-; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]];
+entry:
+ %i = ptrtoint ptr %s to i64
+ %add = or disjoint i64 %i, 1
+ store i64 %add, ptr %out, align 8
+ ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @memcpy_from_param(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S4]], i64 16, i1 true)
+; CHECK-NEXT: ret void
;
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @gep_bitcast_asc(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
-bb:
- %n64 = sext i32 %n to i64
- %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
- %asc = addrspacecast ptr %gep to ptr addrspace(101)
- %load = load i8, ptr addrspace(101) %asc, align 4
- store i8 %load, ptr %out, align 4
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
ret void
}
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @memcpy_to_param(
+; CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
+; CHECK-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
+; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
+; CHECK-NEXT: ret void
+;
+entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true)
+ ret void
+}
-; Verify that if the pointer escapes, then we do fall back onto using a temp copy.
-; CHECK-LABEL: .visible .entry pointer_escapes
-; CHECK: .local .align 4 .b8 __local_depot{{.*}}
-; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0;
-; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: add.u32 %[[copy_addr:r[0-9]+]], %SPL, 0;
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12];
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8];
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4];
-; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1];
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12],
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8],
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4],
-; CHECK-DAG: st.local.u32 [%[[copy_addr]]],
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-; CHECK64: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-; CHECK32: add.s32 %[[copy_w_offset:r[0-9]+]], %[[copy_addr]],
-; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]];
-; CHECK: st.global.u32 [[[result_addr_g]]], [[value]];
-
-; Function Attrs: convergent norecurse nounwind mustprogress
-define dso_local void @pointer_escapes(ptr nocapture %arg, ptr byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @copy_on_store(
+; CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
+; CHECK-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
+; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4
+; CHECK-NEXT: store i32 [[I]], ptr [[S3]], align 4
+; CHECK-NEXT: ret void
+;
bb:
- %tmp = sext i32 %arg2 to i64
- %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
- %tmp4 = load i32, ptr %tmp3, align 4
- store i32 %tmp4, ptr %arg, align 4
- %tmp5 = call ptr @escape(ptr nonnull %tmp3) #3
+ %i = load i32, ptr %in, align 4
+ store i32 %i, ptr %s, align 4
ret void
}
-; Function Attrs: convergent nounwind
-declare dso_local ptr @escape(ptr) local_unnamed_addr
-
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !6, !7}
+!llvm.module.flags = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
+!llvm.ident = !{!16, !17}
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
!1 = !{i32 1, !"wchar_size", i32 4}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{ptr @static_offset, !"kernel", i32 1}
-!4 = !{ptr @dynamic_offset, !"kernel", i32 1}
-!5 = !{ptr @pointer_escapes, !"kernel", i32 1}
-!6 = !{ptr @gep_bitcast, !"kernel", i32 1}
-!7 = !{ptr @gep_bitcast_asc, !"kernel", i32 1}
+!3 = !{i32 7, !"frame-pointer", i32 2}
+!4 = !{ptr @read_only, !"kernel", i32 1}
+!5 = !{ptr @escape_ptr, !"kernel", i32 1}
+!6 = !{ptr @escape_ptr_gep, !"kernel", i32 1}
+!7 = !{ptr @escape_ptr_store, !"kernel", i32 1}
+!8 = !{ptr @escape_ptr_gep_store, !"kernel", i32 1}
+!9 = !{ptr @escape_math_store, !"kernel", i32 1}
+!10 = !{ptr @memcpy_from_param, !"kernel", i32 1}
+!11 = !{ptr @memcpy_to_param, !"kernel", i32 1}
+!12 = !{ptr @copy_on_store, !"kernel", i32 1}
+!13 = !{ptr @read_only_gep, !"kernel", i32 1}
+!14 = !{ptr @read_only_gep_asc, !"kernel", i32 1}
+!15 = !{ptr @read_only_gep_asc0, !"kernel", i32 1}
+!16 = !{!"clang version 20.0.0git"}
+!17 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK32: {{.*}}
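
For readers skimming the regenerated test file: a CUDA-level sketch of what the `read_only` kernel above models. This source is illustrative only (an assumption, not part of the patch); the IR in the test is what -nvptx-lower-args actually consumes.

  struct S { int a; int b; };

  // The struct is passed to the kernel by value and is only read, so after
  // lowering the load should be served straight from the param address space
  // and no local copy of the byval argument should be emitted.
  __global__ void read_only(int *out, S s) {
    *out = s.a;
  }
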
>From d207ec4aebfff8b5555736cfdf138bf57c1f8edd Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Wed, 28 Aug 2024 11:47:05 -0700
Subject: [PATCH 3/3] fixups
---
llvm/include/llvm/Analysis/PtrUseVisitor.h | 2 +-
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 2 +-
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 4 +---
3 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h
index 539d302bb70a1b..7ae03b4a7716a1 100644
--- a/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -208,7 +208,7 @@ class PtrUseVisitor : protected InstVisitor<DerivedT>,
/// Recursively visit the uses of the given pointer.
/// \returns An info struct about the pointer. See \c PtrInfo for details.
- /// We may also need to process Argument pointers, so the input uses is
+ /// We may also need to process Argument pointers, so the input uses is
/// a common Value type.
PtrInfo visitPtr(Value &I) {
// This must be a pointer type. Get an integer type suitable to hold
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 243f39d8a16719..a79dd23abeec62 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -507,7 +507,7 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
// Storing the pointer escapes it.
if (U->get() == SI.getValueOperand())
return PI.setEscapedAndAborted(&SI);
- // Writes to the pointer are UB w/ __gid_constant__, but do not force a
+ // Writes to the pointer are UB w/ __grid_constant__, but do not force a
// copy.
if (!IsGridConstant)
return PI.setAborted(&SI);
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 38b5ee63f4b2b0..ed5d120902d2a0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -93,9 +93,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
- bool hasCvtaParam() const {
- return SmVersion >= 70 && PTXVersion >= 77;
- }
+ bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
// GPUs with "a" suffix have include architecture-accelerated features that
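
As an aside on the subtarget hook touched above, here is a minimal, self-contained sketch of the gating it enables. The standalone struct and the helper name mayAvoidByvalCopy are hypothetical stand-ins, not code from the patch; the real checks live in NVPTXSubtarget and NVPTXLowerArgs.

  // Hypothetical stand-in for the relevant bits of NVPTXSubtarget.
  struct SubtargetInfo {
    unsigned SmVersion;   // e.g. 70 for sm_70
    unsigned PTXVersion;  // e.g. 77 for PTX ISA 7.7
    bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
  };

  // Sketch of the intended gating: __grid_constant__ arguments can already be
  // accessed in place, and with cvta.param available the copy can also be
  // avoided for ordinary byval arguments whose pointer does not escape.
  static bool mayAvoidByvalCopy(const SubtargetInfo &STI, bool IsGridConstant) {
    return IsGridConstant || STI.hasCvtaParam();
  }
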