[llvm] Add support for "grid_constant" in NVPTXLowerArgs. (PR #96125)
Akshay Deodhar via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 19 17:33:48 PDT 2024
https://github.com/akshayrdeodhar created https://github.com/llvm/llvm-project/pull/96125
- Adds a helper function for checking whether an argument is a grid_constant.
- Adds support for cvta.param using the changes from https://github.com/llvm/llvm-project/pull/95289.
- Handles escaped grid_constant pointers conservatively by casting all of their uses to the generic address space with cvta.param (see the IR sketch below).
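For illustration, here is a sketch of the rewrite for an escaped grid_constant byval argument, mirroring the checks in the new lower-args-gridconstant.ll test (names are illustrative). Before -nvptx-lower-args:

  define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
    %call = call i32 @escape(ptr %input)
    ret void
  }

After the pass, no temporary alloca is created; the argument is cast into the param address space and back to the generic address space through the new intrinsic, which selects to cvta.param:

  define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
    %input.param = addrspacecast ptr %input to ptr addrspace(101)
    %input.param.gen = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) %input.param)
    %call = call i32 @escape(ptr %input.param.gen)
    ret void
  }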
From 8a23d73910c55f026a374db0d29708f55a526a43 Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Mon, 17 Jun 2024 22:45:13 +0000
Subject: [PATCH] Add support for "grid_constant" in NVPTXLowerArgs.
- Adds a helper function for checking whether an argument is a grid_constant (see the annotation example below).
- Adds support for cvta.param using the changes from https://github.com/llvm/llvm-project/pull/95289.
- Handles escaped grid_constant pointers conservatively by casting all of their uses to the generic address space with cvta.param.
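For reference, "grid_constant" is encoded in !nvvm.annotations as a property whose value is an MDNode of 1-based parameter indices. A minimal example, following the form exercised by the new test:

  !nvvm.annotations = !{!0}
  !0 = !{ptr @grid_const_int, !"kernel", i32 1, !"grid_constant", !1}
  !1 = !{i32 1} ; the first parameter is a grid_constant

cacheAnnotationFromMD is extended to accept such MDNode values alongside plain integer values, and argHasNVVMAnnotation gains a StartArgIndexAtOne flag because these indices count from 1 rather than 0.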
---
llvm/include/llvm/IR/IntrinsicsNVVM.td | 5 +
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 1 +
llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 73 ++++++---
llvm/lib/Target/NVPTX/NVPTXUtilities.cpp | 143 ++++++++--------
llvm/lib/Target/NVPTX/NVPTXUtilities.h | 1 +
.../CodeGen/NVPTX/lower-args-gridconstant.ll | 155 ++++++++++++++++++
6 files changed, 292 insertions(+), 86 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 0a9139e0062ba..b7c828566e375 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1596,6 +1596,11 @@ def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty],
[IntrNoMem, IntrSpeculatable, IntrNoCallback],
"llvm.nvvm.ptr.gen.to.param">;
+// Requires sm_70+ and PTX ISA 7.7+.
+def int_nvvm_ptr_param_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty],
+ [llvm_anyptr_ty],
+ [IntrNoMem, IntrSpeculatable, IntrNoCallback]>;
+
// Move intrinsics, used in nvvm internally
def int_nvvm_move_i16 : Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem],
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index a65170e56aa24..3e7f8d63439c8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2475,6 +2475,7 @@ defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen, useShortPtrLocal>
defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen, useShortPtrShared>;
defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen, False>;
defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen, useShortPtrConst>;
+defm cvta_param : NG_TO_G<"param", int_nvvm_ptr_param_to_gen, False>;
defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local, useShortPtrLocal>;
defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared, useShortPtrShared>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index cde02c25c4834..1116b8e6313f7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -95,7 +95,9 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
@@ -336,8 +338,9 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
while (!ValuesToCheck.empty()) {
Value *V = ValuesToCheck.pop_back_val();
if (!IsALoadChainInstr(V)) {
- LLVM_DEBUG(dbgs() << "Need a copy of " << *Arg << " because of " << *V
- << "\n");
+ LLVM_DEBUG(dbgs() << "Need a "
+ << (isParamGridConstant(*Arg) ? "cast " : "copy ")
+ << "of " << *Arg << " because of " << *V << "\n");
(void)Arg;
return false;
}
@@ -366,27 +369,55 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
return;
}
- // Otherwise we have to create a temporary copy.
const DataLayout &DL = Func->getParent()->getDataLayout();
unsigned AS = DL.getAllocaAddrSpace();
- AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
- // Set the alignment to alignment of the byval parameter. This is because,
- // later load/stores assume that alignment, and we are going to replace
- // the use of the byval parameter with this alloca instruction.
- AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo())
- .value_or(DL.getPrefTypeAlign(StructType)));
- Arg->replaceAllUsesWith(AllocA);
-
- Value *ArgInParam = new AddrSpaceCastInst(
- Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
- FirstInst);
- // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
- // addrspacecast preserves alignment. Since params are constant, this load is
- // definitely not volatile.
- LoadInst *LI =
- new LoadInst(StructType, ArgInParam, Arg->getName(),
- /*isVolatile=*/false, AllocA->getAlign(), FirstInst);
- new StoreInst(LI, AllocA, FirstInst);
+ if (isParamGridConstant(*Arg)) {
+ // Writes to a grid constant are undefined behaviour. We do not need a
+ // temporary copy. When a pointer might have escaped, conservatively replace
+ // all of its uses (which might include a device function call) with a cast
+ // to the generic address space.
+    // TODO: only cast byval grid constant parameters at use points that need
+    // a generic address (e.g., merging parameter pointers with pointers in
+    // other address spaces, escaping to call sites, inline asm, or memory),
+    // and use the parameter address space for normal loads.
+ IRBuilder<> IRB(&Func->getEntryBlock().front());
+
+ // Cast argument to param address space
+ AddrSpaceCastInst *CastToParam =
+ cast<AddrSpaceCastInst>(IRB.CreateAddrSpaceCast(
+ Arg, IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getName() + ".param"));
+
+ // Cast param address to generic address space
+ Value *CvtToGenCall = IRB.CreateIntrinsic(
+ IRB.getPtrTy(ADDRESS_SPACE_GENERIC), Intrinsic::nvvm_ptr_param_to_gen,
+ CastToParam, nullptr, CastToParam->getName() + ".gen");
+
+ Arg->replaceAllUsesWith(CvtToGenCall);
+
+ // Do not replace Arg in the cast to param space
+ CastToParam->setOperand(0, Arg);
+ } else {
+ // Otherwise we have to create a temporary copy.
+ AllocaInst *AllocA =
+ new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
+    // Set the alignment to the alignment of the byval parameter. This is
+    // because later loads/stores assume that alignment, and we are going to
+    // replace the use of the byval parameter with this alloca instruction.
+ AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo())
+ .value_or(DL.getPrefTypeAlign(StructType)));
+ Arg->replaceAllUsesWith(AllocA);
+
+ Value *ArgInParam = new AddrSpaceCastInst(
+ Arg, PointerType::get(Arg->getContext(), ADDRESS_SPACE_PARAM),
+ Arg->getName(), FirstInst);
+ // Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
+ // addrspacecast preserves alignment. Since params are constant, this load
+ // is definitely not volatile.
+ LoadInst *LI =
+ new LoadInst(StructType, ArgInParam, Arg->getName(),
+ /*isVolatile=*/false, AllocA->getAlign(), FirstInst);
+ new StoreInst(LI, AllocA, FirstInst);
+ }
}
void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 3a536db1c9727..96db2079ed59f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -52,29 +52,45 @@ void clearAnnotationCache(const Module *Mod) {
AC.Cache.erase(Mod);
}
-static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+static void readIntVecFromMDNode(const MDNode *MetadataNode,
+ std::vector<unsigned> &Vec) {
+ for (unsigned i = 0, e = MetadataNode->getNumOperands(); i != e; ++i) {
+ ConstantInt *Val =
+ mdconst::extract<ConstantInt>(MetadataNode->getOperand(i));
+ Vec.push_back(Val->getZExtValue());
+ }
+}
+
+static void cacheAnnotationFromMD(const MDNode *MetadataNode,
+ key_val_pair_t &retval) {
auto &AC = getAnnotationCache();
std::lock_guard<sys::Mutex> Guard(AC.Lock);
- assert(md && "Invalid mdnode for annotation");
- assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
+ assert(MetadataNode && "Invalid mdnode for annotation");
+ assert((MetadataNode->getNumOperands() % 2) == 1 &&
+ "Invalid number of operands");
// start index = 1, to skip the global variable key
// increment = 2, to skip the value for each property-value pairs
- for (unsigned i = 1, e = md->getNumOperands(); i != e; i += 2) {
+ for (unsigned i = 1, e = MetadataNode->getNumOperands(); i != e; i += 2) {
// property
- const MDString *prop = dyn_cast<MDString>(md->getOperand(i));
+ const MDString *prop = dyn_cast<MDString>(MetadataNode->getOperand(i));
assert(prop && "Annotation property not a string");
+ std::string Key = prop->getString().str();
// value
- ConstantInt *Val = mdconst::dyn_extract<ConstantInt>(md->getOperand(i + 1));
- assert(Val && "Value operand not a constant int");
-
- std::string keyname = prop->getString().str();
- if (retval.find(keyname) != retval.end())
- retval[keyname].push_back(Val->getZExtValue());
- else {
- std::vector<unsigned> tmp;
- tmp.push_back(Val->getZExtValue());
- retval[keyname] = tmp;
+ if (ConstantInt *Val = mdconst::dyn_extract<ConstantInt>(
+ MetadataNode->getOperand(i + 1))) {
+ retval[Key].push_back(Val->getZExtValue());
+ } else if (MDNode *VecMd =
+ dyn_cast<MDNode>(MetadataNode->getOperand(i + 1))) {
+        // Invariant: there can be only one key-value pair of the form
+        // (string key, MDNode node) per key. The operands of such a node
+        // are always unsigned integers.
+ if (retval.find(Key) == retval.end()) {
+ readIntVecFromMDNode(VecMd, retval[Key]);
+ continue;
+ }
+ } else {
+ llvm_unreachable("Value operand not a constant int or an mdnode");
}
}
}
@@ -153,9 +169,9 @@ bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
bool isTexture(const Value &val) {
if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
- unsigned annot;
- if (findOneNVVMAnnotation(gv, "texture", annot)) {
- assert((annot == 1) && "Unexpected annotation on a texture symbol");
+ unsigned Annot;
+ if (findOneNVVMAnnotation(gv, "texture", Annot)) {
+ assert((Annot == 1) && "Unexpected annotation on a texture symbol");
return true;
}
}
@@ -164,70 +180,68 @@ bool isTexture(const Value &val) {
bool isSurface(const Value &val) {
if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
- unsigned annot;
- if (findOneNVVMAnnotation(gv, "surface", annot)) {
- assert((annot == 1) && "Unexpected annotation on a surface symbol");
+ unsigned Annot;
+ if (findOneNVVMAnnotation(gv, "surface", Annot)) {
+ assert((Annot == 1) && "Unexpected annotation on a surface symbol");
return true;
}
}
return false;
}
-bool isSampler(const Value &val) {
- const char *AnnotationName = "sampler";
-
- if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
- unsigned annot;
- if (findOneNVVMAnnotation(gv, AnnotationName, annot)) {
- assert((annot == 1) && "Unexpected annotation on a sampler symbol");
- return true;
- }
- }
- if (const Argument *arg = dyn_cast<Argument>(&val)) {
- const Function *func = arg->getParent();
- std::vector<unsigned> annot;
- if (findAllNVVMAnnotation(func, AnnotationName, annot)) {
- if (is_contained(annot, arg->getArgNo()))
+static bool argHasNVVMAnnotation(const Value &Val,
+ const std::string &Annotation,
+ const bool StartArgIndexAtOne = false) {
+ if (const Argument *Arg = dyn_cast<Argument>(&Val)) {
+ const Function *Func = Arg->getParent();
+ std::vector<unsigned> Annot;
+ if (findAllNVVMAnnotation(Func, Annotation, Annot)) {
+ const unsigned BaseOffset = StartArgIndexAtOne ? 1 : 0;
+ if (is_contained(Annot, BaseOffset + Arg->getArgNo())) {
return true;
+ }
}
}
return false;
}
-bool isImageReadOnly(const Value &val) {
- if (const Argument *arg = dyn_cast<Argument>(&val)) {
- const Function *func = arg->getParent();
- std::vector<unsigned> annot;
- if (findAllNVVMAnnotation(func, "rdoimage", annot)) {
- if (is_contained(annot, arg->getArgNo()))
- return true;
+bool isParamGridConstant(const Value &V) {
+ if (const Argument *Arg = dyn_cast<Argument>(&V)) {
+    // "grid_constant" counts argument indices starting from 1
+    if (Arg->hasByValAttr() &&
+        argHasNVVMAnnotation(*Arg, "grid_constant",
+                             /*StartArgIndexAtOne=*/true)) {
+ assert(isKernelFunction(*Arg->getParent()) &&
+ "only kernel arguments can be grid_constant");
+ return true;
}
}
return false;
}
-bool isImageWriteOnly(const Value &val) {
- if (const Argument *arg = dyn_cast<Argument>(&val)) {
- const Function *func = arg->getParent();
- std::vector<unsigned> annot;
- if (findAllNVVMAnnotation(func, "wroimage", annot)) {
- if (is_contained(annot, arg->getArgNo()))
- return true;
+bool isSampler(const Value &val) {
+ const char *AnnotationName = "sampler";
+
+ if (const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+ unsigned Annot;
+ if (findOneNVVMAnnotation(gv, AnnotationName, Annot)) {
+ assert((Annot == 1) && "Unexpected annotation on a sampler symbol");
+ return true;
}
}
- return false;
+ return argHasNVVMAnnotation(val, AnnotationName);
+}
+
+bool isImageReadOnly(const Value &val) {
+ return argHasNVVMAnnotation(val, "rdoimage");
+}
+
+bool isImageWriteOnly(const Value &val) {
+ return argHasNVVMAnnotation(val, "wroimage");
}
bool isImageReadWrite(const Value &val) {
- if (const Argument *arg = dyn_cast<Argument>(&val)) {
- const Function *func = arg->getParent();
- std::vector<unsigned> annot;
- if (findAllNVVMAnnotation(func, "rdwrimage", annot)) {
- if (is_contained(annot, arg->getArgNo()))
- return true;
- }
- }
- return false;
+ return argHasNVVMAnnotation(val, "rdwrimage");
}
bool isImage(const Value &val) {
@@ -236,9 +250,9 @@ bool isImage(const Value &val) {
bool isManaged(const Value &val) {
if(const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
- unsigned annot;
- if (findOneNVVMAnnotation(gv, "managed", annot)) {
- assert((annot == 1) && "Unexpected annotation on a managed symbol");
+ unsigned Annot;
+ if (findOneNVVMAnnotation(gv, "managed", Annot)) {
+ assert((Annot == 1) && "Unexpected annotation on a managed symbol");
return true;
}
}
@@ -323,8 +337,7 @@ bool getMaxNReg(const Function &F, unsigned &x) {
bool isKernelFunction(const Function &F) {
unsigned x = 0;
- bool retval = findOneNVVMAnnotation(&F, "kernel", x);
- if (!retval) {
+ if (!findOneNVVMAnnotation(&F, "kernel", x)) {
// There is no NVVM metadata, check the calling convention
return F.getCallingConv() == CallingConv::PTX_Kernel;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index e020bc0f02e96..c15ff6cae1f27 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -62,6 +62,7 @@ bool getMaxClusterRank(const Function &, unsigned &);
bool getMinCTASm(const Function &, unsigned &);
bool getMaxNReg(const Function &, unsigned &);
bool isKernelFunction(const Function &);
+bool isParamGridConstant(const Value &);
MaybeAlign getAlign(const Function &, unsigned);
MaybeAlign getAlign(const CallInst &, unsigned);
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
new file mode 100644
index 0000000000000..46f54e0e6f4d4
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -0,0 +1,155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT
+; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX
+
+define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
+; PTX-LABEL: grid_const_int(
+; PTX-NOT: ld.u32
+; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0];
+;
+; OPT-LABEL: define void @grid_const_int(
+; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) {
+; OPT-NOT: alloca
+; OPT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101)
+; OPT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
+;
+ %tmp = load i32, ptr %input1, align 4
+ %add = add i32 %tmp, %input2
+ store i32 %add, ptr %out
+ ret void
+}
+
+%struct.s = type { i32, i32 }
+
+define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out) {
+; PTX-LABEL: grid_const_struct(
+; PTX: {
+; PTX-NOT: ld.u32
+; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_struct_param_0];
+; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_struct_param_0+4];
+;
+; OPT-LABEL: define void @grid_const_struct(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) {
+; OPT-NOT: alloca
+; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
+; OPT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
+; OPT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
+; OPT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
+;
+ %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
+ %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
+ %int1 = load i32, ptr %gep1
+ %int2 = load i32, ptr %gep2
+ %add = add i32 %int1, %int2
+ store i32 %add, ptr %out
+ ret void
+}
+
+define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
+; PTX-LABEL: grid_const_escape(
+; PTX: {
+; PTX-NOT: .local
+; PTX: cvta.param.{{.*}}
+; OPT-LABEL: define void @grid_const_escape(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) {
+; OPT-NOT: alloca [[STRUCT_S]]
+; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
+;
+ %call = call i32 @escape(ptr %input)
+ ret void
+}
+
+define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
+; PTX-LABEL: multiple_grid_const_escape(
+; PTX: mov.{{.*}} [[RD1:%.*]], multiple_grid_const_escape_param_0;
+; PTX: mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2;
+; PTX: mov.{{.*}} [[RD3:%.*]], [[RD2]];
+; PTX: cvta.param.{{.*}} [[RD4:%.*]], [[RD3]];
+; PTX: mov.u64 [[RD5:%.*]], [[RD1]];
+; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD5]];
+; PTX: {
+; PTX: st.param.b64 [param0+0], [[RD6]];
+; PTX: st.param.b64 [param2+0], [[RD4]];
+;
+; OPT-LABEL: define void @multiple_grid_const_escape(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) {
+; OPT-NOT: alloca i32
+; OPT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101)
+; OPT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]])
+; OPT-NOT: alloca [[STRUCT_S]]
+; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, ptr [[B_PARAM_GEN]])
+;
+ %a.addr = alloca i32, align 4
+ store i32 %a, ptr %a.addr, align 4
+ %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
+ ret void
+}
+
+define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
+; PTX-LABEL: grid_const_memory_escape(
+; PTX-NOT: .local
+; PTX: mov.b64 [[RD1:%.*]], grid_const_memory_escape_param_0;
+; PTX: cvta.param.u64 [[RD3:%.*]], [[RD2:%.*]];
+; PTX: st.global.u64 [[[RD4:%.*]]], [[RD3]];
+;
+; OPT-LABEL: define void @grid_const_memory_escape(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) {
+; OPT-NOT: alloca [[STRUCT_S]]
+; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT: store ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, align 8
+;
+ store ptr %input, ptr %addr, align 8
+ ret void
+}
+
+define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
+; PTX-LABEL: grid_const_inlineasm_escape(
+; PTX-NOT: .local
+; PTX: cvta.param.u64 [[RD2:%.*]], {{.*}}
+; PTX: add.{{.*}} [[RD3:%.*]], [[RD2]], 4;
+; PTX: add.s64 [[RD1:%.*]], [[RD2]], [[RD3]];
+;
+; OPT-LABEL: define void @grid_const_inlineasm_escape(
+; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) {
+; OPT-NOT: alloca [[STRUCT_S]]
+; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101)
+; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]])
+; OPT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT_PARAM_GEN]], i32 0, i32 0
+; OPT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT_PARAM_GEN]], i32 0, i32 1
+; OPT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
+;
+ %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
+ %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
+ %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
+ store i64 %1, ptr %result, align 8
+ ret void
+}
+
+declare dso_local ptr @escape(ptr) local_unnamed_addr
+declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
+
+!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11}
+
+!0 = !{ptr @grid_const_int, !"kernel", i32 1, !"grid_constant", !1}
+!1 = !{i32 1}
+
+!2 = !{ptr @grid_const_struct, !"kernel", i32 1, !"grid_constant", !3}
+!3 = !{i32 1}
+
+!4 = !{ptr @grid_const_escape, !"kernel", i32 1, !"grid_constant", !5}
+!5 = !{i32 1}
+
+!6 = !{ptr @multiple_grid_const_escape, !"kernel", i32 1, !"grid_constant", !7}
+!7 = !{i32 1, i32 3}
+
+!8 = !{ptr @grid_const_memory_escape, !"kernel", i32 1, !"grid_constant", !9}
+!9 = !{i32 1}
+
+!10 = !{ptr @grid_const_inlineasm_escape, !"kernel", i32 1, !"grid_constant", !11}
+!11 = !{i32 1}