[llvm] [NVPTX] Improve copy avoidance during lowering. (PR #106423)

via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 28 10:43:48 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-analysis

Author: Artem Belevich (Artem-B)

<details>
<summary>Changes</summary>

On newer GPUs, where the `cvta.param` instruction is available, we can avoid
making local copies of byval arguments when their pointers are used in a few
more cases, even when `__grid_constant__` is not specified:

- phi
- select
- memcpy from the parameter.

Switched pointer traversal from a DIY implementation to PtrUseVisitor.
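
As a rough illustration of the new phi/select handling (a hand-written sketch with hypothetical names; the pass's actual output may differ in detail), the byval pointer is rebased into the param address space and converted back to a generic pointer via `cvta.param` (the `llvm.nvvm.ptr.param.to.gen` intrinsic), so it can flow through the select without forcing an alloca copy:

```llvm
; Hypothetical input: the byval pointer feeds a select.
define ptx_kernel void @example_before(ptr byval(i32) align 4 %in, ptr %other, i1 %c) {
  %p = select i1 %c, ptr %in, ptr %other
  %v = load i32, ptr %p
  ret void
}

; Sketch of the lowered form: no local copy, just a conversion to generic.
define ptx_kernel void @example_after(ptr byval(i32) align 4 %in, ptr %other, i1 %c) {
  %in.param = addrspacecast ptr %in to ptr addrspace(101)
  %in.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %in.param)
  %p = select i1 %c, ptr %in.gen, ptr %other
  %v = load i32, ptr %p
  ret void
}
```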

Note: includes PtrUseVisitor patch #<!-- -->106308 which will land separately.

---

Patch is 77.94 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/106423.diff


6 Files Affected:

- (modified) llvm/include/llvm/Analysis/PtrUseVisitor.h (+5-2) 
- (modified) llvm/lib/Analysis/PtrUseVisitor.cpp (+1-1) 
- (modified) llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp (+194-73) 
- (modified) llvm/lib/Target/NVPTX/NVPTXSubtarget.h (+3) 
- (modified) llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll (+374-182) 
- (modified) llvm/test/CodeGen/NVPTX/lower-byval-args.ll (+268-134) 


``````````diff
diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h
index b6cc14d2077af0..539d302bb70a1b 100644
--- a/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -157,7 +157,7 @@ class PtrUseVisitorBase {
   ///
   /// This will visit the users with the same offset of the current visit
   /// (including an unknown offset if that is the current state).
-  void enqueueUsers(Instruction &I);
+  void enqueueUsers(Value &I);
 
   /// Walk the operands of a GEP and adjust the offset as appropriate.
   ///
@@ -208,11 +208,14 @@ class PtrUseVisitor : protected InstVisitor<DerivedT>,
 
   /// Recursively visit the uses of the given pointer.
   /// \returns An info struct about the pointer. See \c PtrInfo for details.
-  PtrInfo visitPtr(Instruction &I) {
+  /// We may also need to process Argument pointers, so the input is the
+  /// common Value type rather than Instruction.
+  PtrInfo visitPtr(Value &I) {
     // This must be a pointer type. Get an integer type suitable to hold
     // offsets on this pointer.
     // FIXME: Support a vector of pointers.
     assert(I.getType()->isPointerTy());
+    assert(isa<Instruction>(I) || isa<Argument>(I));
     IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(I.getType()));
     IsOffsetKnown = true;
     Offset = APInt(IntIdxTy->getBitWidth(), 0);
diff --git a/llvm/lib/Analysis/PtrUseVisitor.cpp b/llvm/lib/Analysis/PtrUseVisitor.cpp
index 49304818d7efed..9c79546f491eff 100644
--- a/llvm/lib/Analysis/PtrUseVisitor.cpp
+++ b/llvm/lib/Analysis/PtrUseVisitor.cpp
@@ -17,7 +17,7 @@
 
 using namespace llvm;
 
-void detail::PtrUseVisitorBase::enqueueUsers(Instruction &I) {
+void detail::PtrUseVisitorBase::enqueueUsers(Value &I) {
   for (Use &U : I.uses()) {
     if (VisitedUses.insert(&U).second) {
       UseToVisit NewU = {
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 1205ad4c6b008f..243f39d8a16719 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -79,15 +79,15 @@
 //
 //    define void @foo({i32*, i32*}* byval %input) {
 //      %b_param = addrspacecast ptr %input to ptr addrspace(101)
-//      %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
-//      %b = load ptr, ptr addrspace(101) %b_ptr
-//      %b_global = addrspacecast ptr %b to ptr addrspace(1)
-//      ; use %b_generic
+//      %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param,
+//               i64 0, i32 1
+//      %b = load ptr, ptr addrspace(101) %b_ptr
+//      %b_global = addrspacecast ptr %b to ptr addrspace(1)
+//      ; use %b_generic
 //    }
 //
-//    Create a local copy of kernel byval parameters used in a way that *might* mutate
-//    the parameter, by storing it in an alloca. Mutations to "grid_constant" parameters
-//    are undefined behaviour, and don't require local copies.
+//    Create a local copy of kernel byval parameters used in a way that *might*
+//    mutate the parameter, by storing it in an alloca. Mutations to
+//    "grid_constant" parameters are undefined behaviour, and don't require
+//    local copies.
 //
 //    define void @foo(ptr byval(%struct.s) align 4 %input) {
 //       store i32 42, ptr %input
@@ -124,11 +124,11 @@
 //
 //    define void @foo(ptr byval(%struct.s) %input) {
 //      %input1 = addrspacecast ptr %input to ptr addrspace(101)
-//      ; the following intrinsic converts pointer to generic. We don't use an addrspacecast
-//      ; to prevent generic -> param -> generic from getting cancelled out
-//      %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
-//      %call = call i32 @escape(ptr %input1.gen)
-//      ret void
+//      ; the following intrinsic converts pointer to generic. We don't use
+//      ; an addrspacecast to prevent generic -> param -> generic from
+//      ; getting cancelled out.
+//      %input1.gen = call ptr
+//          @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
+//      %call = call i32 @escape(ptr %input1.gen)
+//      ret void
 //    }
 //
 // TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
@@ -139,16 +139,21 @@
 #include "NVPTX.h"
 #include "NVPTXTargetMachine.h"
 #include "NVPTXUtilities.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include <numeric>
 #include <queue>
 
@@ -217,7 +222,8 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
 // pointer in parameter AS.
 // For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to
 // generic using cvta.param.
-static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
+static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,
+                             bool IsGridConstant) {
   Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
   assert(I && "OldUse must be in an instruction");
   struct IP {
@@ -228,7 +234,8 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
   SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
   SmallVector<Instruction *> InstructionsToDelete;
 
-  auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
+  auto CloneInstInParamAS = [HasCvtaParam,
+                             IsGridConstant](const IP &I) -> Value * {
     if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
       LI->setOperand(0, I.NewParam);
       return LI;
@@ -252,8 +259,25 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
       // Just pass through the argument, the old ASC is no longer needed.
       return I.NewParam;
     }
+    if (auto *MI = dyn_cast<MemTransferInst>(I.OldInstruction)) {
+      if (MI->getRawSource() == I.OldUse->get()) {
+        // convert to memcpy/memmove from param space.
+        IRBuilder<> Builder(I.OldInstruction);
+        Intrinsic::ID ID = MI->getIntrinsicID();
+
+        CallInst *B = Builder.CreateMemTransferInst(
+            ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,
+            MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
+        for (unsigned I : {0, 1})
+          if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
+            B->addDereferenceableParamAttr(I, Bytes);
+        return B;
+      }
+      // We may be able to handle other cases if the argument is
+      // __grid_constant__
+    }
 
-    if (GridConstant) {
+    if (HasCvtaParam) {
       auto GetParamAddrCastToGeneric =
           [](Value *Addr, Instruction *OriginalUser) -> Value * {
         PointerType *ReturnTy =
@@ -269,24 +293,44 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
                              OriginalUser->getIterator());
         return CvtToGenCall;
       };
-
-      if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
-        I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
-        return CI;
+      auto *ParamInGenericAS =
+          GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
+
+      // phi/select could use generic arg pointers w/o __grid_constant__
+      if (auto *PHI = dyn_cast<PHINode>(I.OldInstruction)) {
+        for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
+          if (V.get() == I.OldUse->get())
+            PHI->setIncomingValue(Idx, ParamInGenericAS);
+        }
       }
-      if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
-        // byval address is being stored, cast it to generic
-        if (SI->getValueOperand() == I.OldUse->get())
-          SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
-        return SI;
+      if (auto *SI = dyn_cast<SelectInst>(I.OldInstruction)) {
+        if (SI->getTrueValue() == I.OldUse->get())
+          SI->setTrueValue(ParamInGenericAS);
+        if (SI->getFalseValue() == I.OldUse->get())
+          SI->setFalseValue(ParamInGenericAS);
       }
-      if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
-        if (PI->getPointerOperand() == I.OldUse->get())
-          PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
-        return PI;
+
+      // Escapes or writes can only use generic param pointers if
+      // __grid_constant__ is in effect.
+      if (IsGridConstant) {
+        if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
+          I.OldUse->set(ParamInGenericAS);
+          return CI;
+        }
+        if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
+          // byval address is being stored, cast it to generic
+          if (SI->getValueOperand() == I.OldUse->get())
+            SI->setOperand(0, ParamInGenericAS);
+          return SI;
+        }
+        if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
+          if (PI->getPointerOperand() == I.OldUse->get())
+            PI->setOperand(0, ParamInGenericAS);
+          return PI;
+        }
+        // TODO: If we allow stores, we should allow memcpy/memset to the
+        // parameter, too.
       }
-      llvm_unreachable(
-          "Instruction unsupported even for grid_constant argument");
     }
 
     llvm_unreachable("Unsupported instruction");
@@ -409,49 +453,121 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
   }
 }
 
+namespace {
+struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
+  using Base = PtrUseVisitor<ArgUseChecker>;
+
+  bool IsGridConstant;
+  SmallPtrSet<Value *, 16> AllArgUsers;
+  // Set of phi/select instructions using the Arg
+  SmallPtrSet<Instruction *, 4> Conditionals;
+
+  ArgUseChecker(const DataLayout &DL, bool IsGridConstant)
+      : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {}
+
+  PtrInfo visitArgPtr(Argument &A) {
+    assert(A.getType()->isPointerTy());
+    IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));
+    IsOffsetKnown = false;
+    Offset = APInt(IntIdxTy->getBitWidth(), 0);
+    PI.reset();
+    AllArgUsers.clear();
+    Conditionals.clear();
+
+    LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
+    // Enqueue the uses of this pointer.
+    enqueueUsers(A);
+    AllArgUsers.insert(&A);
+
+    // Visit all the uses off the worklist until it is empty.
+    // Note that, unlike PtrUseVisitor, we intentionally do not track offsets.
+    // We're only interested in how we use the pointer.
+    while (!(Worklist.empty() || PI.isAborted())) {
+      UseToVisit ToVisit = Worklist.pop_back_val();
+      U = ToVisit.UseAndIsOffsetKnown.getPointer();
+      Instruction *I = cast<Instruction>(U->getUser());
+      AllArgUsers.insert(I);
+      if (isa<PHINode>(I) || isa<SelectInst>(I))
+        Conditionals.insert(I);
+      LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
+      Base::visit(I);
+    }
+    if (PI.isEscaped())
+      LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()
+                        << "\n");
+    else if (PI.isAborted())
+      LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()
+                        << "\n");
+    LLVM_DEBUG(dbgs() << "Traversed " << AllArgUsers.size() << " with "
+                      << Conditionals.size() << " conditionals\n");
+    return PI;
+  }
+
+  void visitStoreInst(StoreInst &SI) {
+    // Storing the pointer escapes it.
+    if (U->get() == SI.getValueOperand())
+      return PI.setEscapedAndAborted(&SI);
+    // Writes to the pointer are UB w/ __grid_constant__, but do not force a
+    // copy.
+    if (!IsGridConstant)
+      return PI.setAborted(&SI);
+  }
+
+  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+    // ASCs to param space are no-ops and do not need a copy.
+    if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM)
+      return PI.setEscapedAndAborted(&ASC);
+    Base::visitAddrSpaceCastInst(ASC);
+  }
+
+  void visitPtrToIntInst(PtrToIntInst &I) {
+    if (IsGridConstant)
+      return;
+    Base::visitPtrToIntInst(I);
+  }
+  void visitPHINodeOrSelectInst(Instruction &I) {
+    assert(isa<PHINode>(I) || isa<SelectInst>(I));
+  }
+  // PHI and select just pass through the pointers.
+  void visitPHINode(PHINode &PN) { enqueueUsers(PN); }
+  void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); }
+
+  void visitMemTransferInst(MemTransferInst &II) {
+    if (*U == II.getRawDest() && !IsGridConstant)
+      PI.setAborted(&II);
+
+    // TODO: memcpy from arg is OK as it can get unrolled into ld.param.
+    // However, memcpys are currently expected to be unrolled before we
+    // get here, so we never see them in practice, and we do not currently
+    // handle them when we convert IR to access param space directly. So,
+    // we'll mark it as an escape for now. It would still force a copy on
+    // pre-sm_70 GPUs where we can't take address of a parameter w/o a copy.
+    //
+    // PI.setEscaped(&II);
+  }
+
+  void visitMemSetInst(MemSetInst &II) {
+    if (*U == II.getRawDest() && !IsGridConstant)
+      PI.setAborted(&II);
+  }
+  // Debug-only helper.
+  auto &getVisitedUses() { return VisitedUses; }
+};
+} // namespace
 void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
                                       Argument *Arg) {
-  bool IsGridConstant = isParamGridConstant(*Arg);
   Function *Func = Arg->getParent();
+  bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+  bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
+  const DataLayout &DL = Func->getDataLayout();
   BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
   Type *StructType = Arg->getParamByValType();
   assert(StructType && "Missing byval type");
 
-  auto AreSupportedUsers = [&](Value *Start) {
-    SmallVector<Value *, 16> ValuesToCheck = {Start};
-    auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
-      if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
-        return true;
-      // ASC to param space are OK, too -- we'll just strip them.
-      if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
-        if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
-          return true;
-      }
-      // Simple calls and stores are supported for grid_constants
-      // writes to these pointers are undefined behaviour
-      if (IsGridConstant &&
-          (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
-        return true;
-      return false;
-    };
-
-    while (!ValuesToCheck.empty()) {
-      Value *V = ValuesToCheck.pop_back_val();
-      if (!IsSupportedUse(V)) {
-        LLVM_DEBUG(dbgs() << "Need a "
-                          << (isParamGridConstant(*Arg) ? "cast " : "copy ")
-                          << "of " << *Arg << " because of " << *V << "\n");
-        (void)Arg;
-        return false;
-      }
-      if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
-          !isa<PtrToIntInst>(V))
-        llvm::append_range(ValuesToCheck, V->users());
-    }
-    return true;
-  };
-
-  if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
+  ArgUseChecker AUC(DL, IsGridConstant);
+  ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
+  // Easy case, accessing parameter directly is fine.
+  if (!(PI.isEscaped() || PI.isAborted()) && AUC.Conditionals.empty()) {
     // Convert all loads and intermediate operations to use parameter AS and
     // skip creation of a local copy of the argument.
     SmallVector<Use *, 16> UsesToUpdate;
@@ -462,7 +578,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
         Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
         FirstInst);
     for (Use *U : UsesToUpdate)
-      convertToParamAS(U, ArgInParamAS, IsGridConstant);
+      convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
     LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
 
     const auto *TLI =
@@ -473,13 +589,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
     return;
   }
 
-  const DataLayout &DL = Func->getDataLayout();
+  // We can't access the byval arg directly and need a pointer. On sm_70+ we
+  // have the ability to take a pointer to the argument without making a local
+  // copy. However, we're still not allowed to write to it. If the user
+  // specified `__grid_constant__` for the argument, we'll consider the
+  // escaped pointer as read-only.
   unsigned AS = DL.getAllocaAddrSpace();
-  if (isParamGridConstant(*Arg)) {
-    // Writes to a grid constant are undefined behaviour. We do not need a
-    // temporary copy. When a pointer might have escaped, conservatively replace
-    // all of its uses (which might include a device function call) with a cast
-    // to the generic address space.
+  if (HasCvtaParam && (!(PI.isEscaped() || PI.isAborted()) || IsGridConstant)) {
+    LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
+    // Replace all argument pointer uses (which might include a device function
+    // call) with a cast to the generic address space using cvta.param
+    // instruction, which avoids a local copy.
     IRBuilder<> IRB(&Func->getEntryBlock().front());
 
     // Cast argument to param address space
@@ -500,6 +620,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
     // Do not replace Arg in the cast to param space
     CastToParam->setOperand(0, Arg);
   } else {
+    LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n");
     // Otherwise we have to create a temporary copy.
     AllocaInst *AllocA =
         new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e47050734aae1e..38b5ee63f4b2b0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -93,6 +93,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool hasDotInstructions() const {
     return SmVersion >= 61 && PTXVersion >= 50;
   }
+  bool hasCvtaParam() const {
+    return SmVersion >= 70 && PTXVersion >= 77;
+  }
   unsigned int getFullSmVersion() const { return FullSmVersion; }
   unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
 // GPUs with an "a" suffix include architecture-accelerated features that
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index f6db9c429dba57..176dfee11cfb09 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -1,18 +1,30 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT
-; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
+; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX
 
 define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
 ; PTX-LABEL: grid_const_int(
-; PTX-NOT:     ld.u32
-; PTX:         ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0];
-; 
+; PTX:       {
+; PTX-NEXT:    .reg .b32 %r<4>;
+; PTX-NEXT:    .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    ld.param.u64 %rd1, [grid_const_int_param_2];
+; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT:    ld.param.u32 %r1, [grid_const_int_param_1];
+; PTX-NEXT:    ld.param.u32 %r2, [grid_const_int_param_0];
+; PTX-NEXT:    add.s32 %r3, %r2, %r1;
+; PTX-NEXT:    st.global.u32 [%rd2], %r3;
+; PTX-NEXT:    ret;
 ; OPT-LABEL: define void @grid_const_int(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) {
-; OPT-NOT:     alloca
-; OPT:         [[INPUT11:%.*]] = addrspacecast ptr [[INPU...
[truncated]

``````````
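
As a hedged sketch of the memcpy-from-parameter case handled in `convertToParamAS` above (hand-written IR with hypothetical names, not the pass's verbatim output): the source operand of the transfer is rebased into the param address space, so the copy can be lowered to `ld.param` reads instead of forcing a full local copy of the argument:

```llvm
; Before: a generic-space memcpy whose source is the byval argument %in.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dst, ptr align 4 %in, i64 16, i1 false)

; After (sketch): same copy, but reading through the param-space view.
%in.param = addrspacecast ptr %in to ptr addrspace(101)
call void @llvm.memcpy.p0.p101.i64(ptr align 4 %dst, ptr addrspace(101) align 4 %in.param, i64 16, i1 false)
```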

</details>
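
When neither rewrite applies (no `cvta.param` on the target, or a potentially mutating use without `__grid_constant__`), the pass keeps the existing fallback of materializing a local copy. A minimal sketch of that path, assuming a `%struct.s` byval argument `%in` (hypothetical names):

```llvm
; Copy the argument out of param space into a local alloca; all uses of
; %in are then redirected to %in.copy.
%in.copy = alloca %struct.s, align 4
%in.param = addrspacecast ptr %in to ptr addrspace(101)
%val = load %struct.s, ptr addrspace(101) %in.param, align 4
store %struct.s %val, ptr %in.copy, align 4
```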


https://github.com/llvm/llvm-project/pull/106423


More information about the llvm-commits mailing list