[llvm] [NVPTX] Improve copy avoidance during lowering. (PR #106423)

Artem Belevich via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 28 12:48:06 PDT 2024


https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/106423

From b5bc3ad2ae818899c917849eaac9fe8856c6c9a3 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Tue, 27 Aug 2024 16:20:35 -0700
Subject: [PATCH 1/3] [PtrUseVisitor] Allow using Argument as a starting point

Argument is another possible starting point for the pointer traversal,
and PtrUseVisitor should be able to handle it.
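
For illustration, here is a hypothetical IR sketch (not part of this patch) of
the situation that motivates the change: in the NVPTX lowering below, the
pointer whose uses we need to traverse is the byval kernel argument %input
itself, i.e. an Argument rather than an Instruction such as an alloca:

    ; Hypothetical example: the traversal root is %input, an Argument.
    ; Previously visitPtr() only accepted an Instruction as the root.
    %struct.s = type { i32, i32 }

    define void @kernel(ptr byval(%struct.s) align 4 %input, ptr %out) {
      %gep = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
      %v = load i32, ptr %gep, align 4
      store i32 %v, ptr %out, align 4
      ret void
    }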
---
 llvm/include/llvm/Analysis/PtrUseVisitor.h | 7 +++++--
 llvm/lib/Analysis/PtrUseVisitor.cpp        | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h
index b6cc14d2077af0..539d302bb70a1b 100644
--- a/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -157,7 +157,7 @@ class PtrUseVisitorBase {
   ///
   /// This will visit the users with the same offset of the current visit
   /// (including an unknown offset if that is the current state).
-  void enqueueUsers(Instruction &I);
+  void enqueueUsers(Value &I);
 
   /// Walk the operands of a GEP and adjust the offset as appropriate.
   ///
@@ -208,11 +208,14 @@ class PtrUseVisitor : protected InstVisitor<DerivedT>,
 
   /// Recursively visit the uses of the given pointer.
   /// \returns An info struct about the pointer. See \c PtrInfo for details.
-  PtrInfo visitPtr(Instruction &I) {
+  /// We may also need to process Argument pointers, so the input is the
+  /// common Value type.
+  PtrInfo visitPtr(Value &I) {
     // This must be a pointer type. Get an integer type suitable to hold
     // offsets on this pointer.
     // FIXME: Support a vector of pointers.
     assert(I.getType()->isPointerTy());
+    assert(isa<Instruction>(I) || isa<Argument>(I));
     IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(I.getType()));
     IsOffsetKnown = true;
     Offset = APInt(IntIdxTy->getBitWidth(), 0);
diff --git a/llvm/lib/Analysis/PtrUseVisitor.cpp b/llvm/lib/Analysis/PtrUseVisitor.cpp
index 49304818d7efed..9c79546f491eff 100644
--- a/llvm/lib/Analysis/PtrUseVisitor.cpp
+++ b/llvm/lib/Analysis/PtrUseVisitor.cpp
@@ -17,7 +17,7 @@
 
 using namespace llvm;
 
-void detail::PtrUseVisitorBase::enqueueUsers(Instruction &I) {
+void detail::PtrUseVisitorBase::enqueueUsers(Value &I) {
   for (Use &U : I.uses()) {
     if (VisitedUses.insert(&U).second) {
       UseToVisit NewU = {

From a8835f7c339c4f973eba977361dac54e405f1d9f Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Fri, 23 Aug 2024 17:36:37 -0700
Subject: [PATCH 2/3] [NVPTX] Improve copy avoidance during lowering.

On newer GPUs, where the `cvta.param` instruction is available, we can avoid
making local copies of byval arguments when their pointers are used in a few
more cases, even when __grid_constant__ is not specified:

- phi
- select
- memcpy from the parameter.
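
For example (a sketch distilled from the updated tests, assuming sm_70 and
PTX 7.7), a select between two byval argument pointers no longer forces
local copies; both pointers are converted to generic with cvta.param instead:

    define void @select_example(ptr byval(i32) align 4 %a, ptr byval(i32) %b,
                                ptr %inout) {
      %v = load i32, ptr %inout
      %cond = icmp slt i32 %v, 0
      ; This select previously forced local copies of %a and %b. With
      ; cvta.param available, both pointers are cast to generic instead.
      %p = select i1 %cond, ptr %a, ptr %b
      %r = load i32, ptr %p, align 4
      store i32 %r, ptr %inout, align 4
      ret void
    }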
---
 llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp      | 267 ++++++---
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h        |   3 +
 .../CodeGen/NVPTX/lower-args-gridconstant.ll  | 556 ++++++++++++------
 llvm/test/CodeGen/NVPTX/lower-byval-args.ll   | 402 ++++++++-----
 4 files changed, 839 insertions(+), 389 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 1205ad4c6b008f..243f39d8a16719 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -79,15 +79,15 @@
 //
 //    define void @foo({i32*, i32*}* byval %input) {
 //      %b_param = addrspacecast ptr %input to ptr addrspace(101)
-//      %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param, i64 0, i32 1
-//      %b = load ptr, ptr addrspace(101) %b_ptr
-//      %b_global = addrspacecast ptr %b to ptr addrspace(1)
-//      ; use %b_generic
+//      %b_ptr = getelementptr {ptr, ptr}, ptr addrspace(101) %b_param,
+//                             i64 0, i32 1
+//      %b = load ptr, ptr addrspace(101) %b_ptr
+//      %b_global = addrspacecast ptr %b to ptr addrspace(1)
+//      ; use %b_generic
 //    }
 //
-//    Create a local copy of kernel byval parameters used in a way that *might* mutate
-//    the parameter, by storing it in an alloca. Mutations to "grid_constant" parameters
-//    are undefined behaviour, and don't require local copies.
+//    Create a local copy of kernel byval parameters used in a way that *might*
+//    mutate the parameter, by storing it in an alloca. Mutations to
+//    "grid_constant" parameters are undefined behaviour, and don't require
+//    local copies.
 //
 //    define void @foo(ptr byval(%struct.s) align 4 %input) {
 //       store i32 42, ptr %input
@@ -124,11 +124,11 @@
 //
 //    define void @foo(ptr byval(%struct.s) %input) {
 //      %input1 = addrspacecast ptr %input to ptr addrspace(101)
-//      ; the following intrinsic converts pointer to generic. We don't use an addrspacecast
-//      ; to prevent generic -> param -> generic from getting cancelled out
-//      %input1.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
-//      %call = call i32 @escape(ptr %input1.gen)
-//      ret void
+//      ; the following intrinsic converts pointer to generic. We don't use an
+//      ; addrspacecast to prevent generic -> param -> generic from getting
+//      ; cancelled out
+//      %input1.gen = call ptr
+//          @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input1)
+//      %call = call i32 @escape(ptr %input1.gen)
+//      ret void
 //    }
 //
 // TODO: merge this pass with NVPTXInferAddressSpaces so that other passes don't
@@ -139,16 +139,21 @@
 #include "NVPTX.h"
 #include "NVPTXTargetMachine.h"
 #include "NVPTXUtilities.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include <numeric>
 #include <queue>
 
@@ -217,7 +222,8 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args",
 // pointer in parameter AS.
 // For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to
 // generic using cvta.param.
-static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
+static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam,
+                             bool IsGridConstant) {
   Instruction *I = dyn_cast<Instruction>(OldUse->getUser());
   assert(I && "OldUse must be in an instruction");
   struct IP {
@@ -228,7 +234,8 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
   SmallVector<IP> ItemsToConvert = {{OldUse, I, Param}};
   SmallVector<Instruction *> InstructionsToDelete;
 
-  auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * {
+  auto CloneInstInParamAS = [HasCvtaParam,
+                             IsGridConstant](const IP &I) -> Value * {
     if (auto *LI = dyn_cast<LoadInst>(I.OldInstruction)) {
       LI->setOperand(0, I.NewParam);
       return LI;
@@ -252,8 +259,25 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
       // Just pass through the argument, the old ASC is no longer needed.
       return I.NewParam;
     }
+    if (auto *MI = dyn_cast<MemTransferInst>(I.OldInstruction)) {
+      if (MI->getRawSource() == I.OldUse->get()) {
+        // convert to memcpy/memmove from param space.
+        IRBuilder<> Builder(I.OldInstruction);
+        Intrinsic::ID ID = MI->getIntrinsicID();
+
+        CallInst *B = Builder.CreateMemTransferInst(
+            ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam,
+            MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
+        for (unsigned I : {0, 1})
+          if (uint64_t Bytes = MI->getParamDereferenceableBytes(I))
+            B->addDereferenceableParamAttr(I, Bytes);
+        return B;
+      }
+      // We may be able to handle other cases if the argument is
+      // __grid_constant__
+    }
 
-    if (GridConstant) {
+    if (HasCvtaParam) {
       auto GetParamAddrCastToGeneric =
           [](Value *Addr, Instruction *OriginalUser) -> Value * {
         PointerType *ReturnTy =
@@ -269,24 +293,44 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) {
                              OriginalUser->getIterator());
         return CvtToGenCall;
       };
-
-      if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
-        I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI));
-        return CI;
+      auto *ParamInGenericAS =
+          GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction);
+
+      // phi/select could use generic arg pointers w/o __grid_constant__
+      if (auto *PHI = dyn_cast<PHINode>(I.OldInstruction)) {
+        for (auto [Idx, V] : enumerate(PHI->incoming_values())) {
+          if (V.get() == I.OldUse->get())
+            PHI->setIncomingValue(Idx, ParamInGenericAS);
+        }
       }
-      if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
-        // byval address is being stored, cast it to generic
-        if (SI->getValueOperand() == I.OldUse->get())
-          SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI));
-        return SI;
+      if (auto *SI = dyn_cast<SelectInst>(I.OldInstruction)) {
+        if (SI->getTrueValue() == I.OldUse->get())
+          SI->setTrueValue(ParamInGenericAS);
+        if (SI->getFalseValue() == I.OldUse->get())
+          SI->setFalseValue(ParamInGenericAS);
       }
-      if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
-        if (PI->getPointerOperand() == I.OldUse->get())
-          PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI));
-        return PI;
+
+      // Escapes or writes can only use generic param pointers if
+      // __grid_constant__ is in effect.
+      if (IsGridConstant) {
+        if (auto *CI = dyn_cast<CallInst>(I.OldInstruction)) {
+          I.OldUse->set(ParamInGenericAS);
+          return CI;
+        }
+        if (auto *SI = dyn_cast<StoreInst>(I.OldInstruction)) {
+          // byval address is being stored, cast it to generic
+          if (SI->getValueOperand() == I.OldUse->get())
+            SI->setOperand(0, ParamInGenericAS);
+          return SI;
+        }
+        if (auto *PI = dyn_cast<PtrToIntInst>(I.OldInstruction)) {
+          if (PI->getPointerOperand() == I.OldUse->get())
+            PI->setOperand(0, ParamInGenericAS);
+          return PI;
+        }
+        // TODO: If we allow stores, we should allow memcpy/memset to the
+        // parameter, too.
       }
-      llvm_unreachable(
-          "Instruction unsupported even for grid_constant argument");
     }
 
     llvm_unreachable("Unsupported instruction");
@@ -409,49 +453,121 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
   }
 }
 
+namespace {
+struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
+  using Base = PtrUseVisitor<ArgUseChecker>;
+
+  bool IsGridConstant;
+  SmallPtrSet<Value *, 16> AllArgUsers;
+  // Set of phi/select instructions using the Arg
+  SmallPtrSet<Instruction *, 4> Conditionals;
+
+  ArgUseChecker(const DataLayout &DL, bool IsGridConstant)
+      : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {}
+
+  PtrInfo visitArgPtr(Argument &A) {
+    assert(A.getType()->isPointerTy());
+    IntegerType *IntIdxTy = cast<IntegerType>(DL.getIndexType(A.getType()));
+    IsOffsetKnown = false;
+    Offset = APInt(IntIdxTy->getBitWidth(), 0);
+    PI.reset();
+    AllArgUsers.clear();
+    Conditionals.clear();
+
+    LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n");
+    // Enqueue the uses of this pointer.
+    enqueueUsers(A);
+    AllArgUsers.insert(&A);
+
+    // Visit all the uses off the worklist until it is empty.
+    // Note that unlike PtrUseVisitor, we intentionally do not track offsets.
+    // We're only interested in how we use the pointer.
+    while (!(Worklist.empty() || PI.isAborted())) {
+      UseToVisit ToVisit = Worklist.pop_back_val();
+      U = ToVisit.UseAndIsOffsetKnown.getPointer();
+      Instruction *I = cast<Instruction>(U->getUser());
+      AllArgUsers.insert(I);
+      if (isa<PHINode>(I) || isa<SelectInst>(I))
+        Conditionals.insert(I);
+      LLVM_DEBUG(dbgs() << "Processing " << *I << "\n");
+      Base::visit(I);
+    }
+    if (PI.isEscaped())
+      LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst()
+                        << "\n");
+    else if (PI.isAborted())
+      LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst()
+                        << "\n");
+    LLVM_DEBUG(dbgs() << "Traversed " << AllArgUsers.size() << " with "
+                      << Conditionals.size() << " conditionals\n");
+    return PI;
+  }
+
+  void visitStoreInst(StoreInst &SI) {
+    // Storing the pointer escapes it.
+    if (U->get() == SI.getValueOperand())
+      return PI.setEscapedAndAborted(&SI);
+    // Writes to the pointer are UB with __grid_constant__, but do not force
+    // a copy.
+    if (!IsGridConstant)
+      return PI.setAborted(&SI);
+  }
+
+  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+    // ASCs to param space are no-ops and do not need a copy
+    if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM)
+      return PI.setEscapedAndAborted(&ASC);
+    Base::visitAddrSpaceCastInst(ASC);
+  }
+
+  void visitPtrToIntInst(PtrToIntInst &I) {
+    if (IsGridConstant)
+      return;
+    Base::visitPtrToIntInst(I);
+  }
+  void visitPHINodeOrSelectInst(Instruction &I) {
+    assert(isa<PHINode>(I) || isa<SelectInst>(I));
+  }
+  // PHI and select just pass through the pointers.
+  void visitPHINode(PHINode &PN) { enqueueUsers(PN); }
+  void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); }
+
+  void visitMemTransferInst(MemTransferInst &II) {
+    if (*U == II.getRawDest() && !IsGridConstant)
+      PI.setAborted(&II);
+
+    // TODO: memcpy from arg is OK as it can get unrolled into ld.param.
+    // However, memcpys are currently expected to be unrolled before we
+    // get here, so we never see them in practice, and we do not currently
+    // handle them when we convert IR to access param space directly. So,
+    // we'll mark it as an escape for now. It would still force a copy on
+    // pre-sm_70 GPUs where we can't take address of a parameter w/o a copy.
+    //
+    // PI.setEscaped(&II);
+  }
+
+  void visitMemSetInst(MemSetInst &II) {
+    if (*U == II.getRawDest() && !IsGridConstant)
+      PI.setAborted(&II);
+  }
+  // debug only helper.
+  auto &getVisitedUses() { return VisitedUses; }
+};
+} // namespace
 void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
                                       Argument *Arg) {
-  bool IsGridConstant = isParamGridConstant(*Arg);
   Function *Func = Arg->getParent();
+  bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+  bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
+  const DataLayout &DL = Func->getDataLayout();
   BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
   Type *StructType = Arg->getParamByValType();
   assert(StructType && "Missing byval type");
 
-  auto AreSupportedUsers = [&](Value *Start) {
-    SmallVector<Value *, 16> ValuesToCheck = {Start};
-    auto IsSupportedUse = [IsGridConstant](Value *V) -> bool {
-      if (isa<GetElementPtrInst>(V) || isa<BitCastInst>(V) || isa<LoadInst>(V))
-        return true;
-      // ASC to param space are OK, too -- we'll just strip them.
-      if (auto *ASC = dyn_cast<AddrSpaceCastInst>(V)) {
-        if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM)
-          return true;
-      }
-      // Simple calls and stores are supported for grid_constants
-      // writes to these pointers are undefined behaviour
-      if (IsGridConstant &&
-          (isa<CallInst>(V) || isa<StoreInst>(V) || isa<PtrToIntInst>(V)))
-        return true;
-      return false;
-    };
-
-    while (!ValuesToCheck.empty()) {
-      Value *V = ValuesToCheck.pop_back_val();
-      if (!IsSupportedUse(V)) {
-        LLVM_DEBUG(dbgs() << "Need a "
-                          << (isParamGridConstant(*Arg) ? "cast " : "copy ")
-                          << "of " << *Arg << " because of " << *V << "\n");
-        (void)Arg;
-        return false;
-      }
-      if (!isa<LoadInst>(V) && !isa<CallInst>(V) && !isa<StoreInst>(V) &&
-          !isa<PtrToIntInst>(V))
-        llvm::append_range(ValuesToCheck, V->users());
-    }
-    return true;
-  };
-
-  if (llvm::all_of(Arg->users(), AreSupportedUsers)) {
+  ArgUseChecker AUC(DL, IsGridConstant);
+  ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg);
+  // Easy case: accessing the parameter directly is fine.
+  if (!(PI.isEscaped() || PI.isAborted()) && AUC.Conditionals.empty()) {
     // Convert all loads and intermediate operations to use parameter AS and
     // skip creation of a local copy of the argument.
     SmallVector<Use *, 16> UsesToUpdate;
@@ -462,7 +578,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
         Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
         FirstInst);
     for (Use *U : UsesToUpdate)
-      convertToParamAS(U, ArgInParamAS, IsGridConstant);
+      convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
     LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n");
 
     const auto *TLI =
@@ -473,13 +589,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
     return;
   }
 
-  const DataLayout &DL = Func->getDataLayout();
+  // We can't access the byval arg directly, so we need a pointer to it. On
+  // sm_70+ we have the ability to take a pointer to the argument without
+  // making a local copy. However, we're still not allowed to write to it. If
+  // the user specified `__grid_constant__` for the argument, we'll consider
+  // the escaped pointer read-only.
   unsigned AS = DL.getAllocaAddrSpace();
-  if (isParamGridConstant(*Arg)) {
-    // Writes to a grid constant are undefined behaviour. We do not need a
-    // temporary copy. When a pointer might have escaped, conservatively replace
-    // all of its uses (which might include a device function call) with a cast
-    // to the generic address space.
+  if (HasCvtaParam && (!(PI.isEscaped() || PI.isAborted()) || IsGridConstant)) {
+    LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n");
+    // Replace all argument pointer uses (which might include a device function
+    // call) with a cast to the generic address space using cvta.param
+    // instruction, which avoids a local copy.
     IRBuilder<> IRB(&Func->getEntryBlock().front());
 
     // Cast argument to param address space
@@ -500,6 +620,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
     // Do not replace Arg in the cast to param space
     CastToParam->setOperand(0, Arg);
   } else {
+    LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n");
     // Otherwise we have to create a temporary copy.
     AllocaInst *AllocA =
         new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e47050734aae1e..38b5ee63f4b2b0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -93,6 +93,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool hasDotInstructions() const {
     return SmVersion >= 61 && PTXVersion >= 50;
   }
+  bool hasCvtaParam() const {
+    return SmVersion >= 70 && PTXVersion >= 77;
+  }
   unsigned int getFullSmVersion() const { return FullSmVersion; }
   unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
  // GPUs with "a" suffix include architecture-accelerated features that
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index f6db9c429dba57..176dfee11cfb09 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -1,18 +1,30 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT
-; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
+; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX
 
 define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
 ; PTX-LABEL: grid_const_int(
-; PTX-NOT:     ld.u32
-; PTX:         ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0];
-; 
+; PTX:       {
+; PTX-NEXT:    .reg .b32 %r<4>;
+; PTX-NEXT:    .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    ld.param.u64 %rd1, [grid_const_int_param_2];
+; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT:    ld.param.u32 %r1, [grid_const_int_param_1];
+; PTX-NEXT:    ld.param.u32 %r2, [grid_const_int_param_0];
+; PTX-NEXT:    add.s32 %r3, %r2, %r1;
+; PTX-NEXT:    st.global.u32 [%rd2], %r3;
+; PTX-NEXT:    ret;
 ; OPT-LABEL: define void @grid_const_int(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) {
-; OPT-NOT:     alloca
-; OPT:         [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT:         [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
-;
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; OPT-NEXT:    [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; OPT-NEXT:    [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr
+; OPT-NEXT:    [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT:    [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
+; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
+; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT3]], align 4
+; OPT-NEXT:    ret void
   %tmp = load i32, ptr %input1, align 4
   %add = add i32 %tmp, %input2
   store i32 %add, ptr %out
@@ -24,19 +36,29 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou
 define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
 ; PTX-LABEL: grid_const_struct(
 ; PTX:       {
-; PTX-NOT:     ld.u32
-; PTX:         ld.param.{{.*}} [[R1:%.*]], [grid_const_struct_param_0];
-; PTX:         ld.param.{{.*}} [[R2:%.*]], [grid_const_struct_param_0+4];
-;
+; PTX-NEXT:    .reg .b32 %r<4>;
+; PTX-NEXT:    .reg .b64 %rd<3>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    ld.param.u64 %rd1, [grid_const_struct_param_1];
+; PTX-NEXT:    cvta.to.global.u64 %rd2, %rd1;
+; PTX-NEXT:    ld.param.u32 %r1, [grid_const_struct_param_0];
+; PTX-NEXT:    ld.param.u32 %r2, [grid_const_struct_param_0+4];
+; PTX-NEXT:    add.s32 %r3, %r1, %r2;
+; PTX-NEXT:    st.global.u32 [%rd2], %r3;
+; PTX-NEXT:    ret;
 ; OPT-LABEL: define void @grid_const_struct(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) {
-; OPT-NOT:     alloca
-; OPT:         [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT:         [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
-; OPT:         [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
-; OPT:         [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
-; OPT:         [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; OPT-NEXT:    [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr
+; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT:    [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
+; OPT-NEXT:    [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
+; OPT-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
+; OPT-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
+; OPT-NEXT:    [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; OPT-NEXT:    store i32 [[ADD]], ptr [[OUT5]], align 4
+; OPT-NEXT:    ret void
   %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
   %int1 = load i32, ptr %gep1
@@ -49,41 +71,85 @@ define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
 define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
 ; PTX-LABEL: grid_const_escape(
 ; PTX:       {
-; PTX-NOT:     .local
-; PTX:         cvta.param.{{.*}}
+; PTX-NEXT:    .reg .b32 %r<3>;
+; PTX-NEXT:    .reg .b64 %rd<4>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    mov.b64 %rd1, grid_const_escape_param_0;
+; PTX-NEXT:    mov.u64 %rd2, %rd1;
+; PTX-NEXT:    cvta.param.u64 %rd3, %rd2;
+; PTX-NEXT:    { // callseq 0, 0
+; PTX-NEXT:    .param .b64 param0;
+; PTX-NEXT:    st.param.b64 [param0+0], %rd3;
+; PTX-NEXT:    .param .b32 retval0;
+; PTX-NEXT:    call.uni (retval0),
+; PTX-NEXT:    escape,
+; PTX-NEXT:    (
+; PTX-NEXT:    param0
+; PTX-NEXT:    );
+; PTX-NEXT:    ld.param.b32 %r1, [retval0+0];
+; PTX-NEXT:    } // callseq 0
+; PTX-NEXT:    ret;
 ; OPT-LABEL: define void @grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) {
-; OPT-NOT:     alloca [[STRUCT_S]]
-; OPT:         [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT:         [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT:         [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
+; OPT-NEXT:    ret void
   %call = call i32 @escape(ptr %input)
   ret void
 }
 
 define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
 ; PTX-LABEL: multiple_grid_const_escape(
-; PTX:         mov.{{.*}} [[RD1:%.*]], multiple_grid_const_escape_param_0;
-; PTX:         mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2;
-; PTX:         mov.{{.*}} [[RD3:%.*]], [[RD2]];
-; PTX:         mov.{{.*}} [[RD4:%.*]], [[RD1]];
-; PTX:         cvta.param.{{.*}} [[RD5:%.*]], [[RD4]];
-; PTX:         cvta.param.{{.*}} [[RD6:%.*]], [[RD3]];
-; PTX:         {
-; PTX:         st.param.b64 [param0+0], [[RD5]];
-; PTX:         st.param.b64 [param2+0], [[RD6]];
-;
+; PTX:       {
+; PTX-NEXT:    .local .align 4 .b8 __local_depot3[4];
+; PTX-NEXT:    .reg .b64 %SP;
+; PTX-NEXT:    .reg .b64 %SPL;
+; PTX-NEXT:    .reg .b32 %r<4>;
+; PTX-NEXT:    .reg .b64 %rd<9>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    mov.u64 %SPL, __local_depot3;
+; PTX-NEXT:    cvta.local.u64 %SP, %SPL;
+; PTX-NEXT:    mov.b64 %rd1, multiple_grid_const_escape_param_0;
+; PTX-NEXT:    mov.b64 %rd2, multiple_grid_const_escape_param_2;
+; PTX-NEXT:    mov.u64 %rd3, %rd2;
+; PTX-NEXT:    ld.param.u32 %r1, [multiple_grid_const_escape_param_1];
+; PTX-NEXT:    cvta.param.u64 %rd4, %rd3;
+; PTX-NEXT:    mov.u64 %rd5, %rd1;
+; PTX-NEXT:    cvta.param.u64 %rd6, %rd5;
+; PTX-NEXT:    add.u64 %rd7, %SP, 0;
+; PTX-NEXT:    add.u64 %rd8, %SPL, 0;
+; PTX-NEXT:    st.local.u32 [%rd8], %r1;
+; PTX-NEXT:    { // callseq 1, 0
+; PTX-NEXT:    .param .b64 param0;
+; PTX-NEXT:    st.param.b64 [param0+0], %rd6;
+; PTX-NEXT:    .param .b64 param1;
+; PTX-NEXT:    st.param.b64 [param1+0], %rd7;
+; PTX-NEXT:    .param .b64 param2;
+; PTX-NEXT:    st.param.b64 [param2+0], %rd4;
+; PTX-NEXT:    .param .b32 retval0;
+; PTX-NEXT:    call.uni (retval0),
+; PTX-NEXT:    escape3,
+; PTX-NEXT:    (
+; PTX-NEXT:    param0,
+; PTX-NEXT:    param1,
+; PTX-NEXT:    param2
+; PTX-NEXT:    );
+; PTX-NEXT:    ld.param.b32 %r2, [retval0+0];
+; PTX-NEXT:    } // callseq 1
+; PTX-NEXT:    ret;
 ; OPT-LABEL: define void @multiple_grid_const_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) {
-; OPT:         [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
-; OPT:         [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT-NOT:     alloca %struct.s
-; OPT:         [[A_ADDR:%.*]] = alloca i32, align 4
-; OPT:         [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT:         [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; OPT-NEXT:    [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; OPT-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
 ; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
-;
+; OPT-NEXT:    ret void
   %a.addr = alloca i32, align 4
   store i32 %a, ptr %a.addr, align 4
   %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
@@ -92,40 +158,58 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32
 
 define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
 ; PTX-LABEL: grid_const_memory_escape(
-; PTX-NOT:     .local
-; PTX:         mov.b64 [[RD1:%.*]], grid_const_memory_escape_param_0;
-; PTX:         cvta.param.u64 [[RD3:%.*]], [[RD2:%.*]];
-; PTX:         st.global.u64 [[[RD4:%.*]]], [[RD3]];
-;
+; PTX:       {
+; PTX-NEXT:    .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    mov.b64 %rd1, grid_const_memory_escape_param_0;
+; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_memory_escape_param_1];
+; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT:    mov.u64 %rd4, %rd1;
+; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT:    st.global.u64 [%rd3], %rd5;
+; PTX-NEXT:    ret;
 ; OPT-LABEL: define void @grid_const_memory_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) {
-; OPT-NOT:     alloca [[STRUCT_S]]
-; OPT:         [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT:         [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
-; OPT:         store ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, align 8
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1)
+; OPT-NEXT:    [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr
+; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT:    store ptr [[INPUT1]], ptr [[ADDR5]], align 8
+; OPT-NEXT:    ret void
   store ptr %input, ptr %addr, align 8
   ret void
 }
 
 define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
 ; PTX-LABEL: grid_const_inlineasm_escape(
-; PTX-NOT      .local 
-; PTX:         add.{{.*}} [[RD2:%.*]], [[RD1:%.*]], 4;
-; PTX:         cvta.param.u64 [[RD4:%.*]], [[RD2]]
-; PTX:         cvta.param.u64 [[RD3:%.*]], [[RD1]]
-; PTX:         add.s64 [[RD5:%.*]], [[RD3]], [[RD4]];
-;
+; PTX:       {
+; PTX-NEXT:    .reg .b64 %rd<8>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    mov.b64 %rd4, grid_const_inlineasm_escape_param_0;
+; PTX-NEXT:    ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1];
+; PTX-NEXT:    cvta.to.global.u64 %rd6, %rd5;
+; PTX-NEXT:    mov.u64 %rd7, %rd4;
+; PTX-NEXT:    cvta.param.u64 %rd2, %rd7;
+; PTX-NEXT:    add.s64 %rd3, %rd2, 4;
+; PTX-NEXT:    // begin inline asm
+; PTX-NEXT:    add.s64 %rd1, %rd2, %rd3;
+; PTX-NEXT:    // end inline asm
+; PTX-NEXT:    st.global.u64 [%rd6], %rd1;
+; PTX-NEXT:    ret;
+; PTX-NOT:     .local
 ; OPT-LABEL: define void @grid_const_inlineasm_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) {
-; OPT-NOT:     alloca [[STRUCT_S]]
-; OPT:         [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT:         [[TMPPTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 0
-; OPT:         [[TMPPTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 1
-; OPT:         [[TMPPTR22_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR22]])
-; OPT:         [[TMPPTR13_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR13]])
-; OPT:         [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1)
+; OPT-NEXT:    [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr
+; OPT-NEXT:    [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT-NEXT:    [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
+; OPT-NEXT:    [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
+; OPT-NEXT:    [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
+; OPT-NEXT:    store i64 [[TMP2]], ptr [[RESULT5]], align 8
+; OPT-NEXT:    ret void
   %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
   %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
@@ -135,24 +219,42 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt
 
 define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escape(
-; PTX-NOT:     .local
-; PTX:         ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escape_param_0];
-; PTX:         add.{{.*}}
-; PTX:         cvta.param.u64 [[RD3:%.*]], {{%.*}}
-; PTX:         st.param.{{.*}} [param0+0], [[RD3]]
-; PTX:         call
-;
+; PTX:       {
+; PTX-NEXT:    .reg .b32 %r<5>;
+; PTX-NEXT:    .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    mov.b64 %rd1, grid_const_partial_escape_param_0;
+; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_partial_escape_param_1];
+; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT:    mov.u64 %rd4, %rd1;
+; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT:    ld.u32 %r1, [%rd5];
+; PTX-NEXT:    add.s32 %r2, %r1, %r1;
+; PTX-NEXT:    st.global.u32 [%rd3], %r2;
+; PTX-NEXT:    { // callseq 2, 0
+; PTX-NEXT:    .param .b64 param0;
+; PTX-NEXT:    st.param.b64 [param0+0], %rd5;
+; PTX-NEXT:    .param .b32 retval0;
+; PTX-NEXT:    call.uni (retval0),
+; PTX-NEXT:    escape,
+; PTX-NEXT:    (
+; PTX-NEXT:    param0
+; PTX-NEXT:    );
+; PTX-NEXT:    ld.param.b32 %r3, [retval0+0];
+; PTX-NEXT:    } // callseq 2
+; PTX-NEXT:    ret;
 ; OPT-LABEL: define void @grid_const_partial_escape(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]], ptr {{%.*}}) {
-; OPT-NOT:     alloca
-; OPT:         [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT:         [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT1]], align 4
-; OPT:         [[TWICE:%.*]] = add i32 [[VAL]], [[VAL]]
-; OPT:         store i32 [[TWICE]]
-; OPT:         [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
-; OPT:         [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
-; OPT:         ret void
-;
+; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
+; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
+; OPT-NEXT:    [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]])
+; OPT-NEXT:    [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
+; OPT-NEXT:    [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]]
+; OPT-NEXT:    store i32 [[TWICE]], ptr [[OUTPUT5]], align 4
+; OPT-NEXT:    [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
+; OPT-NEXT:    ret void
   %val = load i32, ptr %input
   %twice = add i32 %val, %val
   store i32 %twice, ptr %output
@@ -163,27 +265,46 @@ define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
 define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
 ; PTX-LABEL: grid_const_partial_escapemem(
 ; PTX:       {
-; PTX:         ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escapemem_param_0];
-; PTX:         ld.param.{{.*}} [[R2:%.*]], [grid_const_partial_escapemem_param_0+4];
-; PTX:         cvta.param.{{.*}} [[RD5:%.*]], {{%.*}};
-; PTX:         st.global.{{.*}} [{{.*}}], [[RD5]];
-; PTX:         add.s32 [[R3:%.*]], [[R1]], [[R2]] 
-; PTX:         st.param.{{.*}} [param0+0], [[RD5]]
-; PTX:         escape
+; PTX-NEXT:    .reg .b32 %r<6>;
+; PTX-NEXT:    .reg .b64 %rd<6>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    mov.b64 %rd1, grid_const_partial_escapemem_param_0;
+; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_partial_escapemem_param_1];
+; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT:    mov.u64 %rd4, %rd1;
+; PTX-NEXT:    cvta.param.u64 %rd5, %rd4;
+; PTX-NEXT:    ld.u32 %r1, [%rd5];
+; PTX-NEXT:    ld.u32 %r2, [%rd5+4];
+; PTX-NEXT:    st.global.u64 [%rd3], %rd5;
+; PTX-NEXT:    add.s32 %r3, %r1, %r2;
+; PTX-NEXT:    { // callseq 3, 0
+; PTX-NEXT:    .param .b64 param0;
+; PTX-NEXT:    st.param.b64 [param0+0], %rd5;
+; PTX-NEXT:    .param .b32 retval0;
+; PTX-NEXT:    call.uni (retval0),
+; PTX-NEXT:    escape,
+; PTX-NEXT:    (
+; PTX-NEXT:    param0
+; PTX-NEXT:    );
+; PTX-NEXT:    ld.param.b32 %r4, [retval0+0];
+; PTX-NEXT:    } // callseq 3
+; PTX-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; PTX-NEXT:    ret;
 ; OPT-LABEL: define i32 @grid_const_partial_escapemem(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr {{%.*}}) {
-; OPT-NOT:     alloca
-; OPT:         [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT:         [[PTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 0
-; OPT:         [[VAL1:%.*]] = load i32, ptr addrspace(101) [[PTR13]], align 4
-; OPT:         [[PTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 1
-; OPT:         [[VAL2:%.*]] = load i32, ptr addrspace(101) [[PTR22]], align 4
-; OPT:         [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
-; OPT:         store ptr [[INPUT1]]
-; OPT:         [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
-; OPT:         [[PTR1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[PTR13]])
-; OPT:         [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
-;
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1)
+; OPT-NEXT:    [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr
+; OPT-NEXT:    [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
+; OPT-NEXT:    [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4
+; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
+; OPT-NEXT:    [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4
+; OPT-NEXT:    store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8
+; OPT-NEXT:    [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
+; OPT-NEXT:    [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
+; OPT-NEXT:    ret i32 [[ADD]]
   %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
   %val1 = load i32, ptr %ptr1
   %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
@@ -194,29 +315,48 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu
   ret i32 %add
 }
 
-define void @grid_const_phi_escape(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
-; PTX-LABEL: grid_const_phi_escape(
-; PTX:         cvta.param.{{.*}} [[RD1:%.*]], {{.*}}
-; PTX:         @[[P1:%.*]] bra $L__BB[[TARGET_LABEL:[_0-9]+]];
-; PTX:        $L__BB[[TARGET_LABEL]]:
-; PTX:         ld.{{.*}} [[R1:%.*]], [[[RD1]]];
-;
-; OPT-LABEL: define void @grid_const_phi_escape(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr {{%.*}}) {
-; OPT:         [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT:         [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
-; OPT:         br i1 {{.*}}, label %[[FIRST:.*]], label %[[SECOND:.*]]
+define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
+; PTX-LABEL: grid_const_phi(
+; PTX:       {
+; PTX-NEXT:    .reg .pred %p<2>;
+; PTX-NEXT:    .reg .b32 %r<3>;
+; PTX-NEXT:    .reg .b64 %rd<9>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    mov.b64 %rd5, grid_const_phi_param_0;
+; PTX-NEXT:    ld.param.u64 %rd6, [grid_const_phi_param_1];
+; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd6;
+; PTX-NEXT:    mov.u64 %rd7, %rd5;
+; PTX-NEXT:    cvta.param.u64 %rd8, %rd7;
+; PTX-NEXT:    ld.global.u32 %r1, [%rd1];
+; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT:    @%p1 bra $L__BB8_2;
+; PTX-NEXT:  // %bb.1: // %second
+; PTX-NEXT:    add.s64 %rd8, %rd8, 4;
+; PTX-NEXT:  $L__BB8_2: // %merge
+; PTX-NEXT:    ld.u32 %r2, [%rd8];
+; PTX-NEXT:    st.global.u32 [%rd1], %r2;
+; PTX-NEXT:    ret;
+; OPT-LABEL: define void @grid_const_phi(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT:    [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT:    br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
 ; OPT:       [[FIRST]]:
-; OPT:         [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
-; OPT:         br label %[[MERGE:.*]]
+; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
+; OPT-NEXT:    br label %[[MERGE:.*]]
 ; OPT:       [[SECOND]]:
-; OPT:         [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
-; OPT:         br label %[[MERGE]]
+; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
+; OPT-NEXT:    br label %[[MERGE]]
 ; OPT:       [[MERGE]]:
-; OPT:         [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
-; OPT-NOT:     load i32, ptr addrspace(101)
-; OPT:         [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-;
+; OPT-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT:    ret void
 
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
@@ -235,32 +375,53 @@ merge:
 }
 
 ; NOTE: %input2 is *not* grid_constant
-define void @grid_const_phi_escape2(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
-; PTX-LABEL: grid_const_phi_escape2(
-; PTX:         ld.param.{{.*}} [[R1:%.*]], [grid_const_phi_escape2_param_1+4];
-; PTX:         @[[P1:%.*]] bra $L__BB[[LABEL:[_0-9]+]];
-; PTX:         cvta.param.u64 [[RD1:%.*]], [[RD2:%.*]];
-; PTX:         ld.u32 [[R1]], [[[RD1]]];
-; PTX:       $L__BB[[LABEL]]:
-; PTX:         st.global.u32 [[[RD3:%.*]]], [[R1]]
-; OPT-LABEL: define void @grid_const_phi_escape2(
-; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr {{%.*}}) {
-; OPT:         [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8
-; OPT:         [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
-; OPT:         [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8
-; OPT:         store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4
-; OPT:         [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT:         [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]])
-; OPT:         br i1 [[LESS:%.*]], label %[[FIRST:.*]], label %[[SECOND:.*]]
+define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
+; PTX-LABEL: grid_const_phi_ngc(
+; PTX:       {
+; PTX-NEXT:    .reg .pred %p<2>;
+; PTX-NEXT:    .reg .b32 %r<3>;
+; PTX-NEXT:    .reg .b64 %rd<12>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    mov.b64 %rd6, grid_const_phi_ngc_param_0;
+; PTX-NEXT:    ld.param.u64 %rd7, [grid_const_phi_ngc_param_2];
+; PTX-NEXT:    cvta.to.global.u64 %rd1, %rd7;
+; PTX-NEXT:    mov.u64 %rd10, %rd6;
+; PTX-NEXT:    cvta.param.u64 %rd11, %rd10;
+; PTX-NEXT:    ld.global.u32 %r1, [%rd1];
+; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT:    @%p1 bra $L__BB9_2;
+; PTX-NEXT:  // %bb.1: // %second
+; PTX-NEXT:    mov.b64 %rd8, grid_const_phi_ngc_param_1;
+; PTX-NEXT:    mov.u64 %rd9, %rd8;
+; PTX-NEXT:    cvta.param.u64 %rd2, %rd9;
+; PTX-NEXT:    add.s64 %rd11, %rd2, 4;
+; PTX-NEXT:  $L__BB9_2: // %merge
+; PTX-NEXT:    ld.u32 %r2, [%rd11];
+; PTX-NEXT:    st.global.u32 [%rd1], %r2;
+; PTX-NEXT:    ret;
+; OPT-LABEL: define void @grid_const_phi_ngc(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT:    [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; OPT-NEXT:    [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT:    br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
 ; OPT:       [[FIRST]]:
-; OPT:         [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0
-; OPT:         br label %[[MERGE:.*]]
+; OPT-NEXT:    [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
+; OPT-NEXT:    br label %[[MERGE:.*]]
 ; OPT:       [[SECOND]]:
-; OPT:         [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1
-; OPT:         br label %[[MERGE]]
+; OPT-NEXT:    [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1
+; OPT-NEXT:    br label %[[MERGE]]
 ; OPT:       [[MERGE]]:
-; OPT:         [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
-;
+; OPT-NEXT:    [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
+; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT:    ret void
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
   br i1 %less, label %first, label %second
@@ -278,22 +439,42 @@ merge:
 }
 
 ; NOTE: %input2 is *not* grid_constant
-define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
-; PTX-LABEL: grid_const_select_escape(
-; PTX:         cvta.param.{{.*}} [[RD2:%.*]], [[RD1:%.*]]
-; PTX:         setp.lt.{{.*}} [[P1:%.*]], {{%.*}}, 0
-; PTX:         add.{{.*}} [[RD3:%.*]], %SP, 0;
-; PTX:         selp.{{.*}} [[RD4:%.*]], [[RD2]], [[RD3]], [[P1]];
-; PTX:         ld.u32 {{%.*}}, [[[RD4]]];
-; OPT-LABEL: define void @grid_const_select_escape(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) {
-; OPT:         [[INPUT24:%.*]] = alloca i32, align 4
-; OPT:         [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
-; OPT:         [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]])
-; OPT:         load i32, ptr [[INOUT]]
-; OPT:         [[PTRNEW:%.*]] = select i1 [[LESS:%.*]], ptr [[INPUT11]], ptr [[INPUT24]]
-; OPT:         [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
-;
+define void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
+; PTX-LABEL: grid_const_select(
+; PTX:       {
+; PTX-NEXT:    .reg .pred %p<2>;
+; PTX-NEXT:    .reg .b32 %r<3>;
+; PTX-NEXT:    .reg .b64 %rd<10>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    mov.b64 %rd1, grid_const_select_param_0;
+; PTX-NEXT:    ld.param.u64 %rd2, [grid_const_select_param_2];
+; PTX-NEXT:    cvta.to.global.u64 %rd3, %rd2;
+; PTX-NEXT:    mov.b64 %rd4, grid_const_select_param_1;
+; PTX-NEXT:    mov.u64 %rd5, %rd4;
+; PTX-NEXT:    cvta.param.u64 %rd6, %rd5;
+; PTX-NEXT:    mov.u64 %rd7, %rd1;
+; PTX-NEXT:    cvta.param.u64 %rd8, %rd7;
+; PTX-NEXT:    ld.global.u32 %r1, [%rd3];
+; PTX-NEXT:    setp.lt.s32 %p1, %r1, 0;
+; PTX-NEXT:    selp.b64 %rd9, %rd8, %rd6, %p1;
+; PTX-NEXT:    ld.u32 %r2, [%rd9];
+; PTX-NEXT:    st.global.u32 [%rd3], %r2;
+; PTX-NEXT:    ret;
+; OPT-LABEL: define void @grid_const_select(
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1)
+; OPT-NEXT:    [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr
+; OPT-NEXT:    [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]])
+; OPT-NEXT:    [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]])
+; OPT-NEXT:    [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4
+; OPT-NEXT:    [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
+; OPT-NEXT:    [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
+; OPT-NEXT:    [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
+; OPT-NEXT:    store i32 [[VALLOADED]], ptr [[INOUT2]], align 4
+; OPT-NEXT:    ret void
   %val = load i32, ptr %inout
   %less = icmp slt i32 %val, 0
   %ptrnew = select i1 %less, ptr %input1, ptr %input2
@@ -304,16 +485,27 @@ define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(
 
 define i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
 ; PTX-LABEL: grid_const_ptrtoint(
-; PTX-NOT:     .local
-; PTX:         ld.param.{{.*}} {{%.*}}, [grid_const_ptrtoint_param_0];
-; PTX:         cvta.param.u64 [[RD1:%.*]], {{%.*}}
-; PTX:         cvt.u32.u64 {{%.*}}, [[RD1]]
+; PTX:       {
+; PTX-NEXT:    .reg .b32 %r<4>;
+; PTX-NEXT:    .reg .b64 %rd<4>;
+; PTX-EMPTY:
+; PTX-NEXT:  // %bb.0:
+; PTX-NEXT:    mov.b64 %rd1, grid_const_ptrtoint_param_0;
+; PTX-NEXT:    mov.u64 %rd2, %rd1;
+; PTX-NEXT:    ld.param.u32 %r1, [grid_const_ptrtoint_param_0];
+; PTX-NEXT:    cvta.param.u64 %rd3, %rd2;
+; PTX-NEXT:    cvt.u32.u64 %r2, %rd3;
+; PTX-NEXT:    add.s32 %r3, %r1, %r2;
+; PTX-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; PTX-NEXT:    ret;
 ; OPT-LABEL: define i32 @grid_const_ptrtoint(
-; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) {
-; OPT:    [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
-; OPT:    [[VAL:%.*]]   = load i32, ptr addrspace(101) [[INPUT2]]
-; OPT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
-; OPT:    [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
+; OPT-NEXT:    [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT-NEXT:    [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
+; OPT-NEXT:    [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]])
+; OPT-NEXT:    [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
+; OPT-NEXT:    [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
+; OPT-NEXT:    ret i32 [[KEEPALIVE]]
   %val = load i32, ptr %input
   %ptrval = ptrtoint ptr %input to i32
   %keepalive = add i32 %val, %ptrval
@@ -352,13 +544,13 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
 !14 = !{ptr @grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15}
 !15 = !{i32 1}
 
-!16 = !{ptr @grid_const_phi_escape, !"kernel", i32 1, !"grid_constant", !17}
+!16 = !{ptr @grid_const_phi, !"kernel", i32 1, !"grid_constant", !17}
 !17 = !{i32 1}
 
-!18 = !{ptr @grid_const_phi_escape2, !"kernel", i32 1, !"grid_constant", !19}
+!18 = !{ptr @grid_const_phi_ngc, !"kernel", i32 1, !"grid_constant", !19}
 !19 = !{i32 1}
 
-!20 = !{ptr @grid_const_select_escape, !"kernel", i32 1, !"grid_constant", !21}
+!20 = !{ptr @grid_const_select, !"kernel", i32 1, !"grid_constant", !21}
 !21 = !{i32 1}
 
 !22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23}
diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
index f041f202777f61..7aec67a2ea628c 100644
--- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll
@@ -1,166 +1,300 @@
-; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32
-; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
-
-%struct.ham = type { [4 x i32] }
-
-; // Verify that load with static offset into parameter is done directly.
-; CHECK-LABEL: .visible .entry static_offset
-; CHECK-NOT:   .local
-; CHECK64: ld.param.u64    [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: mov.b64         %[[param_addr:rd[0-9]+]], {{.*}}_param_1
-; CHECK64: mov.u64         %[[param_addr1:rd[0-9]+]], %[[param_addr]]
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-;
-; CHECK32: ld.param.u32    [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: mov.b32         %[[param_addr:r[0-9]+]], {{.*}}_param_1
-; CHECK32: mov.u32         %[[param_addr1:r[0-9]+]], %[[param_addr]]
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-;
-; CHECK: ld.param.u32    [[value:%r[0-9]+]], [%[[param_addr1]]+12];
-; CHECK: st.global.u32   [[[result_addr_g]]], [[value]];
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @static_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
-bb:
-  %tmp = icmp eq i32 %arg2, 3
-  br i1 %tmp, label %bb3, label %bb6
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -mtriple nvptx -mcpu=sm_70 -nvptx-lower-args -S | FileCheck %s --check-prefixes=CHECK,CHECK32
+source_filename = "<stdin>"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.S = type { i32, i32 }
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+declare dso_local void @_Z6escapePv(ptr noundef) local_unnamed_addr #0
 
-bb3:                                              ; preds = %bb
-  %tmp4 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 3
-  %tmp5 = load i32, ptr %tmp4, align 4
-  store i32 %tmp5, ptr %arg, align 4
-  br label %bb6
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+declare dso_local void @_Z6escapei(i32 noundef) local_unnamed_addr #0
 
-bb6:                                              ; preds = %bb3, %bb
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
+; CHECK-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %i = load i32, ptr %s, align 4
+  store i32 %i, ptr %out, align 4
   ret void
 }
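The checks above capture the core of the copy-avoidance: a byval argument that is only ever read gets no alloca copy at all. Instead, the pass casts the argument pointer into the param address space (101) and reads through it directly. A minimal before/after sketch, with hypothetical value names:

    ; before lowering
    %i = load i32, ptr %s, align 4

    ; after lowering: read straight from the kernel parameter, no local copy
    %s.param = addrspacecast ptr %s to ptr addrspace(101)
    %i = load i32, ptr addrspace(101) %s.param, align 4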
 
-; // Verify that load with dynamic offset into parameter is also done directly.
-; CHECK-LABEL: .visible .entry dynamic_offset
-; CHECK-NOT:   .local
-; CHECK64: ld.param.u64    [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: mov.b64         %[[param_addr:rd[0-9]+]], {{.*}}_param_1
-; CHECK64: mov.u64         %[[param_addr1:rd[0-9]+]], %[[param_addr]]
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-; CHECK64: add.s64         %[[param_w_offset:rd[0-9]+]], %[[param_addr1]],
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only_gep(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; CHECK-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT:    ret void
 ;
-; CHECK32: ld.param.u32    [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: mov.b32         %[[param_addr:r[0-9]+]], {{.*}}_param_1
-; CHECK32: mov.u32         %[[param_addr1:r[0-9]+]], %[[param_addr]]
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-; CHECK32: add.s32         %[[param_w_offset:r[0-9]+]], %[[param_addr1]],
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+  %i = load i32, ptr %b, align 4
+  store i32 %i, ptr %out, align 4
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only_gep_asc(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
+; CHECK-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT:    ret void
 ;
-; CHECK: ld.param.u32    [[value:%r[0-9]+]], [%[[param_w_offset]]];
-; CHECK: st.global.u32   [[[result_addr_g]]], [[value]];
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+  %asc = addrspacecast ptr %b to ptr addrspace(101)
+  %i = load i32, ptr addrspace(101) %asc, align 4
+  store i32 %i, ptr %out, align 4
+  ret void
+}
 
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 {
-bb:
-  %tmp = sext i32 %arg2 to i64
-  %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
-  %tmp4 = load i32, ptr %tmp3, align 4
-  store i32 %tmp4, ptr %arg, align 4
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @read_only_gep_asc0(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; CHECK-NEXT:    [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; CHECK-NEXT:    [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr [[ASC0]], align 4
+; CHECK-NEXT:    store i32 [[I]], ptr [[OUT2]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+  %asc = addrspacecast ptr %b to ptr addrspace(101)
+  %asc0 = addrspacecast ptr addrspace(101) %asc to ptr
+  %i = load i32, ptr %asc0, align 4
+  store i32 %i, ptr %out, align 4
   ret void
 }
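Contrast this with read_only_gep_asc above: once the chain casts back from addrspace(101) to the generic address space, the access is presumably no longer provably confined to the param space, and the pass conservatively falls back to the alloca copy seen in the CHECK lines. The step that defeats the analysis is the round-trip cast:

    %asc  = addrspacecast ptr %b to ptr addrspace(101)     ; still traceable
    %asc0 = addrspacecast ptr addrspace(101) %asc to ptr   ; back to generic: copy is kept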
 
-; Same as above, but with a bitcast present in the chain
-; CHECK-LABEL:.visible .entry gep_bitcast
-; CHECK-NOT: .local
-; CHECK64-DAG: ld.param.u64    [[out:%rd[0-9]+]], [gep_bitcast_param_0]
-; CHECK64-DAG: mov.b64         {{%rd[0-9]+}}, gep_bitcast_param_1
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr(
+; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[S3]]) #[[ATTR0]]
+; CHECK-NEXT:    ret void
 ;
-; CHECK32-DAG: ld.param.u32    [[out:%r[0-9]+]], [gep_bitcast_param_0]
-; CHECK32-DAG: mov.b32         {{%r[0-9]+}}, gep_bitcast_param_1
+entry:
+  call void @_Z6escapePv(ptr noundef nonnull %s) #0
+  ret void
+}
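Once the argument's address escapes, here into @_Z6escapePv, byval semantics require a private writable copy, so the pass emits the four-instruction preamble that all the escape_* checks share: allocate a local %struct.S, load the whole struct from param space, store it into the alloca, and hand the alloca's address onward. Roughly:

    %s.local = alloca %struct.S, align 4
    %s.param = addrspacecast ptr %s to ptr addrspace(101)
    %s.init  = load %struct.S, ptr addrspace(101) %s.param, align 4
    store %struct.S %s.init, ptr %s.local, align 4
    call void @_Z6escapePv(ptr noundef nonnull %s.local)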
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr_gep(
+; CHECK-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; CHECK-NEXT:    call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR0]]
+; CHECK-NEXT:    ret void
 ;
-; CHECK-DAG: ld.param.u32    {{%r[0-9]+}}, [gep_bitcast_param_2]
-; CHECK64:     ld.param.u8     [[value:%rs[0-9]+]], [{{%rd[0-9]+}}]
-; CHECK64:     st.global.u8    [{{%rd[0-9]+}}], [[value]];
-; CHECK32:     ld.param.u8     [[value:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK32:     st.global.u8    [{{%r[0-9]+}}], [[value]];
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+  call void @_Z6escapePv(ptr noundef nonnull %b) #0
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr_store(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[I:%.*]] = ptrtoint ptr [[S3]] to i64
+; CHECK-NEXT:    store i64 [[I]], ptr [[OUT2]], align 8
+; CHECK-NEXT:    ret void
 ;
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @gep_bitcast(ptr nocapture %out,  ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
-bb:
-  %n64 = sext i32 %n to i64
-  %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
-  %load = load i8, ptr %gep, align 4
-  store i8 %load, ptr %out, align 4
+entry:
+  %i = ptrtoint ptr %s to i64
+  store i64 %i, ptr %out, align 8
   ret void
 }
 
-; Same as above, but with an ASC(101) present in the chain
-; CHECK-LABEL:.visible .entry gep_bitcast_asc
-; CHECK-NOT: .local
-; CHECK64-DAG: ld.param.u64    [[out:%rd[0-9]+]], [gep_bitcast_asc_param_0]
-; CHECK64-DAG: mov.b64         {{%rd[0-9]+}}, gep_bitcast_asc_param_1
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_ptr_gep_store(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4
+; CHECK-NEXT:    [[I:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT:    store i64 [[I]], ptr [[OUT2]], align 8
+; CHECK-NEXT:    ret void
 ;
-; CHECK32-DAG: ld.param.u32    [[out:%r[0-9]+]], [gep_bitcast_asc_param_0]
-; CHECK32-DAG: mov.b32         {{%r[0-9]+}}, gep_bitcast_asc_param_1
+entry:
+  %b = getelementptr inbounds nuw i8, ptr %s, i64 4
+  %i = ptrtoint ptr %b to i64
+  store i64 %i, ptr %out, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @escape_math_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @escape_math_store(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    [[I:%.*]] = ptrtoint ptr [[S3]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = or disjoint i64 [[I]], 1
+; CHECK-NEXT:    store i64 [[ADD]], ptr [[OUT2]], align 8
+; CHECK-NEXT:    ret void
 ;
-; CHECK-DAG: ld.param.u32    {{%r[0-9]+}}, [gep_bitcast_asc_param_2]
-; CHECK64:     ld.param.u8     [[value:%rs[0-9]+]], [{{%rd[0-9]+}}]
-; CHECK64:     st.global.u8    [{{%rd[0-9]+}}], [[value]];
-; CHECK32:     ld.param.u8     [[value:%rs[0-9]+]], [{{%r[0-9]+}}]
-; CHECK32:     st.global.u8    [{{%r[0-9]+}}], [[value]];
+entry:
+  %i = ptrtoint ptr %s to i64
+  %add = or disjoint i64 %i, 1
+  store i64 %add, ptr %out, align 8
+  ret void
+}
+
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @memcpy_from_param(
+; CHECK-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1)
+; CHECK-NEXT:    [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S4]], i64 16, i1 true)
+; CHECK-NEXT:    ret void
 ;
-; Function Attrs: nofree norecurse nounwind willreturn mustprogress
-define dso_local void @gep_bitcast_asc(ptr nocapture %out,  ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 {
-bb:
-  %n64 = sext i32 %n to i64
-  %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64
-  %asc = addrspacecast ptr %gep to ptr addrspace(101)
-  %load = load i8, ptr addrspace(101) %asc, align 4
-  store i8 %load, ptr %out, align 4
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)
   ret void
 }
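A memcpy whose source is the byval argument is the interesting new case: it no longer forces a local copy. The intrinsic is retargeted so that its source operand is the param-space pointer, which is why the callee mangling in the check above changes from @llvm.memcpy.p0.p0.i64 to @llvm.memcpy.p0.p101.i64. A sketch of the rewrite:

    ; before
    tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true)

    ; after: the source reads directly from param space
    %s.param = addrspacecast ptr %s to ptr addrspace(101)
    call void @llvm.memcpy.p0.p101.i64(ptr %out, ptr addrspace(101) %s.param, i64 16, i1 true)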
 
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @memcpy_to_param(
+; CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
+; CHECK-NEXT:    [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true)
+  ret void
+}
 
-; Verify that if the pointer escapes, then we do fall back onto using a temp copy.
-; CHECK-LABEL: .visible .entry pointer_escapes
-; CHECK: .local .align 4 .b8     __local_depot{{.*}}
-; CHECK64: ld.param.u64    [[result_addr:%rd[0-9]+]], [{{.*}}_param_0]
-; CHECK64: add.u64         %[[copy_addr:rd[0-9]+]], %SPL, 0;
-; CHECK32: ld.param.u32    [[result_addr:%r[0-9]+]], [{{.*}}_param_0]
-; CHECK32: add.u32         %[[copy_addr:r[0-9]+]], %SPL, 0;
-; CHECK-DAG: ld.param.u32    %{{.*}}, [pointer_escapes_param_1+12];
-; CHECK-DAG: ld.param.u32    %{{.*}}, [pointer_escapes_param_1+8];
-; CHECK-DAG: ld.param.u32    %{{.*}}, [pointer_escapes_param_1+4];
-; CHECK-DAG: ld.param.u32    %{{.*}}, [pointer_escapes_param_1];
-; CHECK-DAG: st.local.u32    [%[[copy_addr]]+12],
-; CHECK-DAG: st.local.u32    [%[[copy_addr]]+8],
-; CHECK-DAG: st.local.u32    [%[[copy_addr]]+4],
-; CHECK-DAG: st.local.u32    [%[[copy_addr]]],
-; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]]
-; CHECK64: add.s64         %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]],
-; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]]
-; CHECK32: add.s32         %[[copy_w_offset:r[0-9]+]], %[[copy_addr]],
-; CHECK: ld.local.u32    [[value:%r[0-9]+]], [%[[copy_w_offset]]];
-; CHECK: st.global.u32   [[[result_addr_g]]], [[value]];
-
-; Function Attrs: convergent norecurse nounwind mustprogress
-define dso_local void @pointer_escapes(ptr nocapture %arg, ptr byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 {
+; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) local_unnamed_addr #0 {
+; CHECK-LABEL: define dso_local void @copy_on_store(
+; CHECK-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[S3:%.*]] = alloca [[STRUCT_S]], align 4
+; CHECK-NEXT:    [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101)
+; CHECK-NEXT:    [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4
+; CHECK-NEXT:    store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4
+; CHECK-NEXT:    [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1)
+; CHECK-NEXT:    [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr
+; CHECK-NEXT:    [[I:%.*]] = load i32, ptr [[IN2]], align 4
+; CHECK-NEXT:    store i32 [[I]], ptr [[S3]], align 4
+; CHECK-NEXT:    ret void
+;
 bb:
-  %tmp = sext i32 %arg2 to i64
-  %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp
-  %tmp4 = load i32, ptr %tmp3, align 4
-  store i32 %tmp4, ptr %arg, align 4
-  %tmp5 = call ptr @escape(ptr nonnull %tmp3) #3
+  %i = load i32, ptr %in, align 4
+  store i32 %i, ptr %s, align 4
   ret void
 }
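Writes go the other way: memcpy_to_param and copy_on_store both mutate the argument, and a mutated byval argument needs real local storage behind it, so both tests keep the alloca-and-copy preamble and direct the write at the local copy (%s.local here stands for the alloca from the preamble sketched earlier):

    ; the store (or memcpy destination) targets the alloca copy, never the raw param
    store i32 %i, ptr %s.local, align 4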
 
-; Function Attrs: convergent nounwind
-declare dso_local ptr @escape(ptr) local_unnamed_addr
-
+attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" "target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
 
-!llvm.module.flags = !{!0, !1, !2}
-!nvvm.annotations = !{!3, !4, !5, !6, !7}
+!llvm.module.flags = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15}
+!llvm.ident = !{!16, !17}
 
-!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]}
+!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]}
 !1 = !{i32 1, !"wchar_size", i32 4}
 !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
-!3 = !{ptr @static_offset, !"kernel", i32 1}
-!4 = !{ptr @dynamic_offset, !"kernel", i32 1}
-!5 = !{ptr @pointer_escapes, !"kernel", i32 1}
-!6 = !{ptr @gep_bitcast, !"kernel", i32 1}
-!7 = !{ptr @gep_bitcast_asc, !"kernel", i32 1}
+!3 = !{i32 7, !"frame-pointer", i32 2}
+!4 = !{ptr @read_only, !"kernel", i32 1}
+!5 = !{ptr @escape_ptr, !"kernel", i32 1}
+!6 = !{ptr @escape_ptr_gep, !"kernel", i32 1}
+!7 = !{ptr @escape_ptr_store, !"kernel", i32 1}
+!8 = !{ptr @escape_ptr_gep_store, !"kernel", i32 1}
+!9 = !{ptr @escape_math_store, !"kernel", i32 1}
+!10 = !{ptr @memcpy_from_param, !"kernel", i32 1}
+!11 = !{ptr @memcpy_to_param, !"kernel", i32 1}
+!12 = !{ptr @copy_on_store, !"kernel", i32 1}
+!13 = !{ptr @read_only_gep, !"kernel", i32 1}
+!14 = !{ptr @read_only_gep_asc, !"kernel", i32 1}
+!15 = !{ptr @read_only_gep_asc0, !"kernel", i32 1}
+!16 = !{!"clang version 20.0.0git"}
+!17 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK32: {{.*}}

>From d207ec4aebfff8b5555736cfdf138bf57c1f8edd Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra at google.com>
Date: Wed, 28 Aug 2024 11:47:05 -0700
Subject: [PATCH 3/3] fixups

---
 llvm/include/llvm/Analysis/PtrUseVisitor.h | 2 +-
 llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp   | 2 +-
 llvm/lib/Target/NVPTX/NVPTXSubtarget.h     | 4 +---
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h
index 539d302bb70a1b..7ae03b4a7716a1 100644
--- a/llvm/include/llvm/Analysis/PtrUseVisitor.h
+++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h
@@ -208,7 +208,7 @@ class PtrUseVisitor : protected InstVisitor<DerivedT>,
 
   /// Recursively visit the uses of the given pointer.
   /// \returns An info struct about the pointer. See \c PtrInfo for details.
-  /// We may also need to process Argument pointers, so the input uses is 
+  /// We may also need to process Argument pointers, so the input uses is
   /// a common Value type.
   PtrInfo visitPtr(Value &I) {
     // This must be a pointer type. Get an integer type suitable to hold
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 243f39d8a16719..a79dd23abeec62 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -507,7 +507,7 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
     // Storing the pointer escapes it.
     if (U->get() == SI.getValueOperand())
       return PI.setEscapedAndAborted(&SI);
-    // Writes to the pointer are UB w/ __gid_constant__, but do not force a
+    // Writes to the pointer are UB w/ __grid_constant__, but do not force a
     // copy.
     if (!IsGridConstant)
       return PI.setAborted(&SI);
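The corrected comment also documents the policy that makes this safe: writing through a __grid_constant__ parameter is undefined behavior at the source level, so such a store does not force a copy, while for an ordinary byval parameter (!IsGridConstant) the same store aborts the no-copy analysis. In IR terms, assuming %s is the byval parameter in question:

    store i32 0, ptr %s   ; ordinary byval: analysis aborts, the copy is kept
                          ; grid_constant: UB anyway, lowering may still skip the copy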
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 38b5ee63f4b2b0..ed5d120902d2a0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -93,9 +93,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   bool hasDotInstructions() const {
     return SmVersion >= 61 && PTXVersion >= 50;
   }
-  bool hasCvtaParam() const {
-    return SmVersion >= 70 && PTXVersion >= 77;
-  }
+  bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
   unsigned int getFullSmVersion() const { return FullSmVersion; }
   unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
   // GPUs with "a" suffix have include architecture-accelerated features that


