[llvm] [NVPTX] Improve copy avoidance during lowering. (PR #106423)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 28 10:43:48 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-analysis
Author: Artem Belevich (Artem-B)
Changes:
On newer GPUs, where the `cvta.param` instruction is available, we can avoid
making local copies of byval arguments when their pointers are used in a few
more cases, even when `__grid_constant__` is not specified:
- phi
- select
- memcpy from the parameter.
Switched pointer traversal from a DIY implementation to `PtrUseVisitor`.
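As a concrete illustration (a hypothetical kernel sketch, not taken from the patch or its tests), a byval pointer that merely flows through a `select` into a load no longer forces a local copy on sm_70+/ptx77; the pass can instead route the incoming pointers through `cvta.param`:

```llvm
; Minimal sketch, assuming sm_70+ and PTX 7.7; all names are invented.
%struct.S = type { i32, i32 }

define ptx_kernel void @select_of_byval(ptr byval(%struct.S) align 4 %a,
                                        ptr byval(%struct.S) align 4 %b,
                                        i1 %c, ptr %out) {
  ; The byval pointers only feed a select and a load, so no alloca/copy is
  ; needed: the pass can cast %a and %b to param space and select between
  ; the generic pointers produced by cvta.param.
  %p = select i1 %c, ptr %a, ptr %b
  %v = load i32, ptr %p
  store i32 %v, ptr %out
  ret void
}
```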
Note: includes PtrUseVisitor patch #106308, which will land separately.
---
Patch is 77.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/106423.diff
6 Files Affected:
- (modified) llvm/include/llvm/Analysis/PtrUseVisitor.h (+5-2)
- (modified) llvm/lib/Analysis/PtrUseVisitor.cpp (+1-1)
- (modified) llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp (+194-73)
- (modified) llvm/lib/Target/NVPTX/NVPTXSubtarget.h (+3)
- (modified) llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll (+374-182)
- (modified) llvm/test/CodeGen/NVPTX/lower-byval-args.ll (+268-134)
``````````diff
diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h
index b6cc14d2077af0..539d302bb70a1b 100644
--- a/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -157,7 +157,7 @@ class PtrUseVisitorBase {
///
/// This will visit the users with the same offset of the current visit
/// (including an unknown offset if that is the current state).
- void enqueueUsers(Instruction &I);
+ void enqueueUsers(Value &I);
/// Walk the operands of a GEP and adjust the offset as appropriate.
///
@@ -208,11 +208,14 @@ class PtrUseVisitor : protected InstVisitor<DerivedT>,
/// Recursively visit the uses of the given pointer.
/// \returns An info struct about the pointer. See \c PtrInfo for details.
- PtrInfo visitPtr(Instruction &I) {
+ /// We may also need to process Argument pointers, so the input uses the
+ /// common Value type.
+ PtrInfo visitPtr(Value &I) {
// This must be a pointer type. Get an integer type suitable to hold
// offsets on this pointer.
// FIXME: Support a vector of pointers.
assert(I.getType()->isPointerTy());
+ assert(isa<Instruction>(I) || isa<Argument>(I));
IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(I.getType()));
IsOffsetKnown = true;
Offset = APInt(IntIdxTy->getBitWidth(), 0);
diff --git a/llvm/lib/Analysis/PtrUseVisitor.cpp b/llvm/lib/Analysis/PtrUseVisitor.cpp
index 49304818d7efed..9c79546f491eff 100644
--- a/llvm/lib/Analysis/PtrUseVisitor.cpp
+++ b/llvm/lib/Analysis/PtrUseVisitor.cpp
@@ -17,7 +17,7 @@
using namespace llvm;
-void detail::PtrUseVisitorBase::enqueueUsers(Instruction &I) {
+void detail::PtrUseVisitorBase::enqueueUsers(Value &I) {
for (Use &U : I.uses()) {
if (VisitedUses.insert(&U).second) {
UseToVisit NewU = {
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 1205ad4c6b008f..243f39d8a16719 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -79,15 +79,15 @@
//
// define void @foo({i32*, i32*}* byval %input) {
// %b_param = addrspacecast ptr %input to ptr addrspace(101)
-// %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
-// %b = load ptr, ptr addrspace(101) %b_ptr
-// %b_global = addrspacecast ptr %b to ptr addrspace(1)
-// ; use %b_generic
+// %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param,
+//     i64 0, i32 1
+// %b = load ptr, ptr addrspace(101) %b_ptr
+// %b_global = addrspacecast ptr %b to ptr addrspace(1)
+// ; use %b_generic
// }
//
-// Create a local copy of kernel byval parameters used in a way that *might* mutate
-// the parameter, by storing it in an alloca. Mutations to "grid_constant" parameters
-// are undefined behaviour, and don't require local copies.
+// Create a local copy of kernel byval parameters used in a way that *might*
+// mutate the parameter, by storing it in an alloca. Mutations to
+// "grid_constant" parameters are undefined behaviour, and don't require
+// local copies.
//
// define void @foo(ptr byval(%struct.s) align 4 %input) {
// store i32 42, ptr %input
@@ -124,11 +124,11 @@
//
// define void @foo(ptr byval(%struct.s) %input) {
// %input1 = addrspacecast ptr %input to ptr addrspace(101)
-// ; the following intrinsic converts pointer to generic. We don't use an addrspacecast
-// ; to prevent generic -> param -> generic from getting cancelled out
-// %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
-// %call = call i32 @escape(ptr %input1.gen)
-// ret void
+// ; the following intrinsic converts pointer to generic. We don't use an
+// ; addrspacecast to prevent generic -> param -> generic from getting
+// ; cancelled out
+// %input1.gen = call ptr
+//     @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
+// %call = call i32 @escape(ptr %input1.gen)
+// ret void
// }
//
// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
@@ -139,16 +139,21 @@
#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include <numeric>
#include <queue>
@@ -217,7 +222,8 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
// pointer in parameter AS.
// For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to
// generic using cvta.param.
-static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
+static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,
+ bool IsGridConstant) {
Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
assert(I && "OldUse must be in an instruction");
struct IP {
@@ -228,7 +234,8 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
SmallVector<Instruction *> InstructionsToDelete;
- auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
+ auto CloneInstInParamAS = [HasCvtaParam,
+ IsGridConstant](const IP &I) -> Value * {
if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
LI->setOperand(0, I.NewParam);
return LI;
@@ -252,8 +259,25 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
// Just pass through the argument, the old ASC is no longer needed.
return I.NewParam;
}
+ if (auto *MI = dyn_cast<MemTransferInst>(I.OldInstruction)) {
+ if (MI->getRawSource() == I.OldUse->get()) {
+ // convert to memcpy/memmove from param space.
+ IRBuilder<> Builder(I.OldInstruction);
+ Intrinsic::ID ID = MI->getIntrinsicID();
+
+ CallInst *B = Builder.CreateMemTransferInst(
+ ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,
+ MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
+ for (unsigned I : {0, 1})
+ if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
+ B->addDereferenceableParamAttr(I, Bytes);
+ return B;
+ }
+ // We may be able to handle other cases if the argument is
+ // __grid_constant__
+ }
- if (GridConstant) {
+ if (HasCvtaParam) {
auto GetParamAddrCastToGeneric =
[](Value *Addr, Instruction *OriginalUser) -> Value * {
PointerType *ReturnTy =
@@ -269,24 +293,44 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
OriginalUser->getIterator());
return CvtToGenCall;
};
-
- if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
- I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
- return CI;
+ auto *ParamInGenericAS =
+ GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
+
+ // phi/select could use generic arg pointers w/o __grid_constant__
+ if (auto *PHI = dyn_cast<PHINode>(I.OldInstruction)) {
+ for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
+ if (V.get() == I.OldUse->get())
+ PHI->setIncomingValue(Idx, ParamInGenericAS);
+ }
}
- if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
- // byval address is being stored, cast it to generic
- if (SI->getValueOperand() == I.OldUse->get())
- SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
- return SI;
+ if (auto *SI = dyn_cast<SelectInst>(I.OldInstruction)) {
+ if (SI->getTrueValue() == I.OldUse->get())
+ SI->setTrueValue(ParamInGenericAS);
+ if (SI->getFalseValue() == I.OldUse->get())
+ SI->setFalseValue(ParamInGenericAS);
}
- if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
- if (PI->getPointerOperand() == I.OldUse->get())
- PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
- return PI;
+
+ // Escapes or writes can only use generic param pointers if
+ // __grid_constant__ is in effect.
+ if (IsGridConstant) {
+ if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
+ I.OldUse->set(ParamInGenericAS);
+ return CI;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
+ // byval address is being stored, cast it to generic
+ if (SI->getValueOperand() == I.OldUse->get())
+ SI->setOperand(0, ParamInGenericAS);
+ return SI;
+ }
+ if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
+ if (PI->getPointerOperand() == I.OldUse->get())
+ PI->setOperand(0, ParamInGenericAS);
+ return PI;
+ }
+      // TODO: If we allow stores, we should allow memcpy/memset to the
+      // parameter, too.
}
- llvm_unreachable(
- "Instruction unsupported even for grid_constant argument");
}
llvm_unreachable("Unsupported instruction");
@@ -409,49 +453,121 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
}
}
+namespace {
+struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
+ using Base = PtrUseVisitor<ArgUseChecker>;
+
+ bool IsGridConstant;
+ SmallPtrSet<Value *, 16> AllArgUsers;
+ // Set of phi/select instructions using the Arg
+ SmallPtrSet<Instruction *, 4> Conditionals;
+
+ ArgUseChecker(const DataLayout &DL, bool IsGridConstant)
+ : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {}
+
+ PtrInfo visitArgPtr(Argument &A) {
+ assert(A.getType()->isPointerTy());
+ IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));
+ IsOffsetKnown = false;
+ Offset = APInt(IntIdxTy->getBitWidth(), 0);
+ PI.reset();
+ AllArgUsers.clear();
+ Conditionals.clear();
+
+ LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
+ // Enqueue the uses of this pointer.
+ enqueueUsers(A);
+ AllArgUsers.insert(&A);
+
+ // Visit all the uses off the worklist until it is empty.
+    // Note that unlike PtrUseVisitor, we intentionally do not track offsets.
+ // We're only interested in how we use the pointer.
+ while (!(Worklist.empty() || PI.isAborted())) {
+ UseToVisit ToVisit = Worklist.pop_back_val();
+ U = ToVisit.UseAndIsOffsetKnown.getPointer();
+ Instruction *I = cast<Instruction>(U->getUser());
+ AllArgUsers.insert(I);
+ if (isa<PHINode>(I) || isa<SelectInst>(I))
+ Conditionals.insert(I);
+ LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
+ Base::visit(I);
+ }
+ if (PI.isEscaped())
+ LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()
+ << "\n");
+ else if (PI.isAborted())
+ LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()
+ << "\n");
+ LLVM_DEBUG(dbgs() << "Traversed " << AllArgUsers.size() << " with "
+ << Conditionals.size() << " conditionals\n");
+ return PI;
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ // Storing the pointer escapes it.
+ if (U->get() == SI.getValueOperand())
+ return PI.setEscapedAndAborted(&SI);
+    // Writes to the pointer are UB with __grid_constant__, but do not force
+    // a copy.
+ if (!IsGridConstant)
+ return PI.setAborted(&SI);
+ }
+
+ void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+    // ASCs to param space are no-ops and do not need a copy.
+ if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM)
+ return PI.setEscapedAndAborted(&ASC);
+ Base::visitAddrSpaceCastInst(ASC);
+ }
+
+ void visitPtrToIntInst(PtrToIntInst &I) {
+ if (IsGridConstant)
+ return;
+ Base::visitPtrToIntInst(I);
+ }
+ void visitPHINodeOrSelectInst(Instruction &I) {
+ assert(isa<PHINode>(I) || isa<SelectInst>(I));
+ }
+ // PHI and select just pass through the pointers.
+ void visitPHINode(PHINode &PN) { enqueueUsers(PN); }
+ void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); }
+
+ void visitMemTransferInst(MemTransferInst &II) {
+ if (*U == II.getRawDest() && !IsGridConstant)
+ PI.setAborted(&II);
+
+ // TODO: memcpy from arg is OK as it can get unrolled into ld.param.
+ // However, memcpys are currently expected to be unrolled before we
+ // get here, so we never see them in practice, and we do not currently
+ // handle them when we convert IR to access param space directly. So,
+ // we'll mark it as an escape for now. It would still force a copy on
+ // pre-sm_70 GPUs where we can't take address of a parameter w/o a copy.
+ //
+ // PI.setEscaped(&II);
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ if (*U == II.getRawDest() && !IsGridConstant)
+ PI.setAborted(&II);
+ }
+ // debug only helper.
+ auto &getVisitedUses() { return VisitedUses; }
+};
+} // namespace
void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Argument *Arg) {
- bool IsGridConstant = isParamGridConstant(*Arg);
Function *Func = Arg->getParent();
+ bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+ bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
+ const DataLayout &DL = Func->getDataLayout();
BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
Type *StructType = Arg->getParamByValType();
assert(StructType && "Missing byval type");
- auto AreSupportedUsers = [&](Value *Start) {
- SmallVector<Value *, 16> ValuesToCheck = {Start};
- auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
- if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
- return true;
- // ASC to param space are OK, too -- we'll just strip them.
- if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
- if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
- return true;
- }
- // Simple calls and stores are supported for grid_constants
- // writes to these pointers are undefined behaviour
- if (IsGridConstant &&
- (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
- return true;
- return false;
- };
-
- while (!ValuesToCheck.empty()) {
- Value *V = ValuesToCheck.pop_back_val();
- if (!IsSupportedUse(V)) {
- LLVM_DEBUG(dbgs() << "Need a "
- << (isParamGridConstant(*Arg) ? "cast " : "copy ")
- << "of " << *Arg << " because of " << *V << "\n");
- (void)Arg;
- return false;
- }
- if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
- !isa<PtrToIntInst>(V))
- llvm::append_range(ValuesToCheck, V->users());
- }
- return true;
- };
-
- if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
+ ArgUseChecker AUC(DL, IsGridConstant);
+ ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
+  // Easy case: accessing the parameter directly is fine.
+ if (!(PI.isEscaped() || PI.isAborted()) && AUC.Conditionals.empty()) {
// Convert all loads and intermediate operations to use parameter AS and
// skip creation of a local copy of the argument.
SmallVector<Use *, 16> UsesToUpdate;
@@ -462,7 +578,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
FirstInst);
for (Use *U : UsesToUpdate)
- convertToParamAS(U, ArgInParamAS, IsGridConstant);
+ convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
const auto *TLI =
@@ -473,13 +589,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
return;
}
- const DataLayout &DL = Func->getDataLayout();
+  // We can't access the byval arg directly and need a pointer to it. On
+  // sm_70+ we have the ability to take a pointer to the argument without
+  // making a local copy. However, we're still not allowed to write through
+  // it. If the user specified `__grid_constant__` for the argument, we'll
+  // consider the escaped pointer read-only.
unsigned AS = DL.getAllocaAddrSpace();
- if (isParamGridConstant(*Arg)) {
- // Writes to a grid constant are undefined behaviour. We do not need a
- // temporary copy. When a pointer might have escaped, conservatively replace
- // all of its uses (which might include a device function call) with a cast
- // to the generic address space.
+ if (HasCvtaParam && (!(PI.isEscaped() || PI.isAborted()) || IsGridConstant)) {
+ LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
+ // Replace all argument pointer uses (which might include a device function
+ // call) with a cast to the generic address space using cvta.param
+ // instruction, which avoids a local copy.
IRBuilder<> IRB(&Func->getEntryBlock().front());
// Cast argument to param address space
@@ -500,6 +620,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
// Do not replace Arg in the cast to param space
CastToParam->setOperand(0, Arg);
} else {
+ LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n");
// Otherwise we have to create a temporary copy.
AllocaInst *AllocA =
new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e47050734aae1e..38b5ee63f4b2b0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -93,6 +93,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
bool hasDotInstructions() const {
return SmVersion >= 61 && PTXVersion >= 50;
}
+ bool hasCvtaParam() const {
+ return SmVersion >= 70 && PTXVersion >= 77;
+ }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
// GPUs with "a" suffix have include architecture-accelerated features that
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index f6db9c429dba57..176dfee11cfb09 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -1,18 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT
-; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
+; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX
define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
; PTX-LABEL: grid_const_int(
-; PTX-NOT: ld.u32
-; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0];
-;
+; PTX: {
+; PTX-NEXT: .reg .b32 %r<4>;
+; PTX-NEXT: .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT: // %bb.0:
+; PTX-NEXT: ld.param.u64 %rd1, [grid_const_int_param_2];
+; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT: ld.param.u32 %r1, [grid_const_int_param_1];
+; PTX-NEXT: ld.param.u32 %r2, [grid_const_int_param_0];
+; PTX-NEXT: add.s32 %r3, %r2, %r1;
+; PTX-NEXT: st.global.u32 [%rd2], %r3;
+; PTX-NEXT: ret;
; OPT-LABEL: define void @grid_const_int(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) {
-; OPT-NOT: alloca
-; OPT: [[INPUT11:%.*]] = addrspacecast ptr [[INPU...
[truncated]
``````````
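For the newly supported memcpy-from-parameter case, a minimal hypothetical before/after sketch (names invented; the authoritative CHECK lines are in the truncated tests above):

```llvm
%struct.S = type { i32, i32 }

declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)

; Before lowering: the memcpy source is the generic byval pointer.
define ptx_kernel void @memcpy_from_param(ptr %dst,
                                          ptr byval(%struct.S) align 4 %src) {
  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 8, i1 false)
  ret void
}

; After lowering (sketch), the source is rewritten to read param space
; directly, so no local copy of %src is required:
;   %src.param = addrspacecast ptr %src to ptr addrspace(101)
;   call void @llvm.memcpy.p0.p101.i64(ptr %dst,
;                                      ptr addrspace(101) %src.param,
;                                      i64 8, i1 false)
```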
https://github.com/llvm/llvm-project/pull/106423