[mlir][GPUDialect] Implement memory attributions for LaunchOp

Wed Apr 26 15:55:22 PDT 2023

Author: Fabian Mora
Date: 2023-04-26T17:53:18-05:00
New Revision: 54e96f4f975ec05f44d998cd13032b1b55dad8ea

URL: https://github.com/llvm/llvm-project/commit/54e96f4f975ec05f44d998cd13032b1b55dad8ea
DIFF: https://github.com/llvm/llvm-project/commit/54e96f4f975ec05f44d998cd13032b1b55dad8ea.diff

LOG: [mlir][GPUDialect] Implement memory attributions for LaunchOp

Currently, memory attributions are not supported for gpu::LaunchOp. This patch implements memory attributions for gpu::LaunchOp and modifies the KernelOutlining pass to make the attributions available in GPUFuncOp.

Reviewed By: makslevental

Differential Revision: https://reviews.llvm.org/D147809




diff  --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 860e20720afd9..e67adbc73f929 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -536,12 +536,14 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     to the amount of dynamic shared memory a kernel's workgroup should be
     allocated; when this operand is not present, a zero size is assumed.
-    The body region has _twelve_ arguments, grouped as follows:
+    The body region has at least _twelve_ arguments, grouped as follows:
     -   three arguments that contain block identifiers along x,y,z dimensions;
     -   three arguments that contain thread identifiers along x,y,z dimensions;
     -   operands of the `gpu.launch` operation as is (i.e. the operands for
         grid and block sizes).
+    -   a variadic number of Workgroup memory attributions.
+    -   a variadic number of Private memory attributions.
@@ -550,8 +552,11 @@ def GPU_LaunchOp : GPU_Op<"launch", [
                              `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
                              `threads` `(` ssa-id-list `)` `in` ssa-reassignment
                              (dynamic_shared_memory_size ssa-use)?
+                             memory-attribution
                              region attr-dict?
     ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
+    memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
+                           (`private` `(` ssa-id-and-type-list `)`)?
@@ -582,6 +587,18 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       "some_op"(%bx, %tx) : (index, index) -> ()
       %3 = "memref.load"(%val1, %bx) : (memref<?xf32, 1>, index) -> f32
+    // Launch with memory attributions.
+    gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
+               threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5)
+               workgroup(%workgroup: memref<32xf32, 3>)
+               private(%private: memref<1xf32, 5>) {
+      // Block and thread identifiers, as well as block/grid sizes are
+      // immediately usable inside body region.
+      "some_op"(%bx, %tx) : (index, index) -> ()
+      // %workgroup is a workgroup memory attribution of this gpu.launch.
+      %42 = memref.load %workgroup[%bx] : memref<32xf32, 3>
+    }
     Rationale: using operation/block arguments gives analyses a clear way of
@@ -601,7 +618,9 @@ def GPU_LaunchOp : GPU_Op<"launch", [
       CArg<"Value", "nullptr">:$dynamicSharedMemorySize,
       CArg<"Type", "nullptr">:$asyncTokenType,
-      CArg<"ValueRange", "{}">:$asyncDependencies)>
+      CArg<"ValueRange", "{}">:$asyncDependencies,
+      CArg<"TypeRange", "{}">:$workgroupAttributions,
+      CArg<"TypeRange", "{}">:$privateAttributions)>
   let extraClassDeclaration = [{
@@ -632,6 +651,57 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     /// The number of region attributes containing the launch configuration,
     /// placed in the leading positions of the argument list.
     static constexpr unsigned kNumConfigRegionAttributes = 12;
+    /// Returns the keywords used in the custom syntax for this Op.
+    static StringRef getWorkgroupKeyword() { return "workgroup"; }
+    static StringRef getPrivateKeyword() { return "private"; }
+    /// Returns the number of buffers located in the workgroup memory.
+    unsigned getNumWorkgroupAttributions() {
+      auto attr = (*this)->getAttrOfType<IntegerAttr>(
+          getNumWorkgroupAttributionsAttrName());
+      return attr ? attr.getInt() : 0;
+    }
+    /// Returns a list of block arguments that correspond to buffers located in
+    /// the workgroup memory
+    ArrayRef<BlockArgument> getWorkgroupAttributions() {
+      auto begin =
+          std::next(getBody().args_begin(), kNumConfigRegionAttributes);
+      auto end = std::next(begin, getNumWorkgroupAttributions());
+      return {begin, end};
+    }
+    /// Adds a new block argument that corresponds to buffers located in
+    /// workgroup memory.
+    BlockArgument addWorkgroupAttribution(Type type, Location loc);
+    /// Returns the number of buffers located in the private memory.
+    unsigned getNumPrivateAttributions() {
+      return getBody().getNumArguments() - kNumConfigRegionAttributes -
+          getNumWorkgroupAttributions();
+    }
+    /// Returns a list of block arguments that correspond to buffers located in
+    /// the private memory.
+    ArrayRef<BlockArgument> getPrivateAttributions() {
+      // Buffers on the private memory always come after buffers on the workgroup
+      // memory.
+      auto begin =
+          std::next(getBody().args_begin(),
+                    kNumConfigRegionAttributes + getNumWorkgroupAttributions());
+      return {begin, getBody().args_end()};
+    }
+    /// Adds a new block argument that corresponds to buffers located in
+    /// private memory.
+    BlockArgument addPrivateAttribution(Type type, Location loc);
+    /// Returns the name of the attribute containing the number of buffers
+    /// located in the workgroup memory.
+    static StringRef getNumWorkgroupAttributionsAttrName() {
+      return "workgroup_attributions";
+    }
   let hasCanonicalizer = 1;

diff  --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index f9d929d163445..3ce6083c1f009 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -332,6 +332,60 @@ static void printAsyncDependencies(OpAsmPrinter &printer, Operation *op,
   printer << ']';
+// GPU Memory attributions functions shared by LaunchOp and GPUFuncOp.
+/// Parses a GPU function memory attribution.
+/// memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
+///                        (`private` `(` ssa-id-and-type-list `)`)?
+/// Note that this function parses only one of the two similar parts, with the
+/// keyword provided as argument.
+static ParseResult
+parseAttributions(OpAsmParser &parser, StringRef keyword,
+                  SmallVectorImpl<OpAsmParser::Argument> &args) {
+  // If we could not parse the keyword, just assume empty list and succeed.
+  if (failed(parser.parseOptionalKeyword(keyword)))
+    return success();
+  return parser.parseArgumentList(args, OpAsmParser::Delimiter::Paren,
+                                  /*allowType=*/true);
+/// Prints a GPU function memory attribution.
+static void printAttributions(OpAsmPrinter &p, StringRef keyword,
+                              ArrayRef<BlockArgument> values) {
+  if (values.empty())
+    return;
+  p << ' ' << keyword << '(';
+  llvm::interleaveComma(
+      values, p, [&p](BlockArgument v) { p << v << " : " << v.getType(); });
+  p << ')';
+/// Verifies a GPU function memory attribution.
+static LogicalResult verifyAttributions(Operation *op,
+                                        ArrayRef<BlockArgument> attributions,
+                                        gpu::AddressSpace memorySpace) {
+  for (Value v : attributions) {
+    auto type = v.getType().dyn_cast<MemRefType>();
+    if (!type)
+      return op->emitOpError() << "expected memref type in attribution";
+    // We can only verify the address space if it hasn't already been lowered
+    // from the AddressSpaceAttr to a target-specific numeric value.
+    auto addressSpace =
+        type.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>();
+    if (!addressSpace)
+      continue;
+    if (addressSpace.getValue() != memorySpace)
+      return op->emitOpError()
+             << "expected memory space " << stringifyAddressSpace(memorySpace)
+             << " in attribution";
+  }
+  return success();
 // AllReduceOp
@@ -439,7 +493,15 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
                      Value gridSizeX, Value gridSizeY, Value gridSizeZ,
                      Value getBlockSizeX, Value getBlockSizeY,
                      Value getBlockSizeZ, Value dynamicSharedMemorySize,
-                     Type asyncTokenType, ValueRange asyncDependencies) {
+                     Type asyncTokenType, ValueRange asyncDependencies,
+                     TypeRange workgroupAttributions,
+                     TypeRange privateAttributions) {
+  // Add a WorkGroup attribution attribute. This attribute is required to
+  // identify private attributions in the list of block arguments.
+  result.addAttribute(getNumWorkgroupAttributionsAttrName(),
+                      builder.getI64IntegerAttr(workgroupAttributions.size()));
+  // Add Op operands.
   if (asyncTokenType)
@@ -450,14 +512,21 @@ void LaunchOp::build(OpBuilder &builder, OperationState &result,
   if (dynamicSharedMemorySize)
-  // Create a kernel body region with kNumConfigRegionAttributes + N arguments,
-  // where the first kNumConfigRegionAttributes arguments have `index` type and
-  // the rest have the same types as the data operands.
+  // Create a kernel body region with kNumConfigRegionAttributes + N memory
+  // attributions, where the first kNumConfigRegionAttributes arguments have
+  // `index` type and the rest have the types of the memory attributions.
   Region *kernelRegion = result.addRegion();
   Block *body = new Block();
+  // TODO: Allow passing in proper locations here.
   for (unsigned i = 0; i < kNumConfigRegionAttributes; ++i)
     body->addArgument(builder.getIndexType(), result.location);
+  // Add WorkGroup & Private attributions to the region arguments.
+  for (Type argTy : workgroupAttributions)
+    body->addArgument(argTy, result.location);
+  for (Type argTy : privateAttributions)
+    body->addArgument(argTy, result.location);
+  // Fill OperandSegmentSize Attribute.
   SmallVector<int32_t, 8> segmentSizes(8, 1);
   segmentSizes.front() = asyncDependencies.size();
   segmentSizes.back() = dynamicSharedMemorySize ? 1 : 0;
@@ -504,13 +573,18 @@ LogicalResult LaunchOp::verifyRegions() {
   // sizes and transforms them into kNumConfigRegionAttributes region arguments
   // for block/thread identifiers and grid/block sizes.
   if (!getBody().empty()) {
-    if (getBody().getNumArguments() !=
-        LaunchOp::kNumConfigOperands + getNumOperands() -
-            (getDynamicSharedMemorySize() ? 1 : 0) -
-            getAsyncDependencies().size())
+    if (getBody().getNumArguments() <
+        kNumConfigRegionAttributes + getNumWorkgroupAttributions())
       return emitOpError("unexpected number of region arguments");
+  // Verify Attributions Address Spaces.
+  if (failed(verifyAttributions(getOperation(), getWorkgroupAttributions(),
+                                GPUDialect::getWorkgroupAddressSpace())) ||
+      failed(verifyAttributions(getOperation(), getPrivateAttributions(),
+                                GPUDialect::getPrivateAddressSpace())))
+    return failure();
   // Block terminators without successors are expected to exit the kernel region
   // and must be `gpu.terminator`.
   for (Block &block : getBody()) {
@@ -563,10 +637,15 @@ void LaunchOp::print(OpAsmPrinter &p) {
     p << ' ' << getDynamicSharedMemorySizeKeyword() << ' '
       << getDynamicSharedMemorySize();
+  printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions());
+  printAttributions(p, getPrivateKeyword(), getPrivateAttributions());
   p << ' ';
   p.printRegion(getBody(), /*printEntryBlockArgs=*/false);
   p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{
-                              LaunchOp::getOperandSegmentSizeAttr()});
+                              LaunchOp::getOperandSegmentSizeAttr(),
+                              getNumWorkgroupAttributionsAttrName()});
 // Parse the size assignment blocks for blocks and threads.  These have the form
@@ -601,8 +680,9 @@ parseSizeAssignment(OpAsmParser &parser,
 /// Parses a Launch operation.
 /// operation ::= `gpu.launch` (`async` `[` ssa-id-list `]`)?
-//        `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
+///       `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
 ///       `threads` `(` ssa-id-list `)` `in` ssa-reassignment
+///       memory-attribution
 ///       region attr-dict?
 /// ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
 ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
@@ -659,9 +739,12 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
       return failure();
-  // Introduce the body region and parse it. The region has
-  // kNumConfigRegionAttributes arguments that correspond to
-  // block/thread identifiers and grid/block sizes, all of the `index` type.
+  // Create the region arguments, it has kNumConfigRegionAttributes arguments
+  // that correspond to block/thread identifiers and grid/block sizes, all
+  // having `index` type, a variadic number of WorkGroup Attributions and
+  // a variadic number of Private Attributions. The number of WorkGroup
+  // Attributions is stored in the attr with name:
+  // LaunchOp::getNumWorkgroupAttributionsAttrName().
   Type index = parser.getBuilder().getIndexType();
   SmallVector<Type, LaunchOp::kNumConfigRegionAttributes> dataTypes(
       LaunchOp::kNumConfigRegionAttributes, index);
@@ -674,6 +757,27 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
+  Builder &builder = parser.getBuilder();
+  // Parse workgroup memory attributions.
+  if (failed(parseAttributions(parser, LaunchOp::getWorkgroupKeyword(),
+                               regionArguments)))
+    return failure();
+  // Store the number of operands we just parsed as the number of workgroup
+  // memory attributions.
+  unsigned numWorkgroupAttrs =
+      regionArguments.size() - LaunchOp::kNumConfigRegionAttributes;
+  result.addAttribute(LaunchOp::getNumWorkgroupAttributionsAttrName(),
+                      builder.getI64IntegerAttr(numWorkgroupAttrs));
+  // Parse private memory attributions.
+  if (failed(parseAttributions(parser, LaunchOp::getPrivateKeyword(),
+                               regionArguments)))
+    return failure();
+  // Introduce the body region and parse it. The region has
+  // kNumConfigRegionAttributes arguments that correspond to
+  // block/thread identifiers and grid/block sizes, all having `index` type.
   Region *body = result.addRegion();
   if (parser.parseRegion(*body, regionArguments) ||
@@ -729,6 +833,25 @@ void LaunchOp::getCanonicalizationPatterns(RewritePatternSet &rewrites,
+/// Adds a new block argument that corresponds to buffers located in
+/// workgroup memory.
+BlockArgument LaunchOp::addWorkgroupAttribution(Type type, Location loc) {
+  auto attrName = getNumWorkgroupAttributionsAttrName();
+  auto attr = (*this)->getAttrOfType<IntegerAttr>(attrName);
+  (*this)->setAttr(attrName,
+                   IntegerAttr::get(attr.getType(), attr.getValue() + 1));
+  return getBody().insertArgument(
+      LaunchOp::kNumConfigRegionAttributes + attr.getInt(), type, loc);
+/// Adds a new block argument that corresponds to buffers located in
+/// private memory.
+BlockArgument LaunchOp::addPrivateAttribution(Type type, Location loc) {
+  // Buffers on the private memory always come after buffers on the workgroup
+  // memory.
+  return getBody().addArgument(type, loc);
 // LaunchFuncOp
@@ -894,24 +1017,6 @@ void GPUFuncOp::build(OpBuilder &builder, OperationState &result,
-/// Parses a GPU function memory attribution.
-/// memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
-///                        (`private` `(` ssa-id-and-type-list `)`)?
-/// Note that this function parses only one of the two similar parts, with the
-/// keyword provided as argument.
-static ParseResult
-parseAttributions(OpAsmParser &parser, StringRef keyword,
-                  SmallVectorImpl<OpAsmParser::Argument> &args) {
-  // If we could not parse the keyword, just assume empty list and succeed.
-  if (failed(parser.parseOptionalKeyword(keyword)))
-    return success();
-  return parser.parseArgumentList(args, OpAsmParser::Delimiter::Paren,
-                                  /*allowType=*/true);
 /// Parses a GPU function.
 /// <operation> ::= `gpu.func` symbol-ref-id `(` argument-list `)`
@@ -985,17 +1090,6 @@ ParseResult GPUFuncOp::parse(OpAsmParser &parser, OperationState &result) {
   return parser.parseRegion(*body, entryArgs);
-static void printAttributions(OpAsmPrinter &p, StringRef keyword,
-                              ArrayRef<BlockArgument> values) {
-  if (values.empty())
-    return;
-  p << ' ' << keyword << '(';
-  llvm::interleaveComma(
-      values, p, [&p](BlockArgument v) { p << v << " : " << v.getType(); });
-  p << ')';
 void GPUFuncOp::print(OpAsmPrinter &p) {
   p << ' ';
@@ -1026,28 +1120,6 @@ LogicalResult GPUFuncOp::verifyType() {
   return success();
-static LogicalResult verifyAttributions(Operation *op,
-                                        ArrayRef<BlockArgument> attributions,
-                                        gpu::AddressSpace memorySpace) {
-  for (Value v : attributions) {
-    auto type = v.getType().dyn_cast<MemRefType>();
-    if (!type)
-      return op->emitOpError() << "expected memref type in attribution";
-    // We can only verify the address space if it hasn't already been lowered
-    // from the AddressSpaceAttr to a target-specific numeric value.
-    auto addressSpace =
-        type.getMemorySpace().dyn_cast_or_null<gpu::AddressSpaceAttr>();
-    if (!addressSpace)
-      continue;
-    if (addressSpace.getValue() != memorySpace)
-      return op->emitOpError()
-             << "expected memory space " << stringifyAddressSpace(memorySpace)
-             << " in attribution";
-  }
-  return success();
 /// Verifies the body of the function.
 LogicalResult GPUFuncOp::verifyBody() {
   if (empty())

diff  --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index 8abf75993a63d..91c1c763f070d 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -190,7 +190,10 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
   FunctionType type =
       FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
-  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(loc, kernelFnName, type);
+  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(
+      loc, kernelFnName, type,
+      TypeRange(ValueRange(launchOp.getWorkgroupAttributions())),
+      TypeRange(ValueRange(launchOp.getPrivateAttributions())));
@@ -213,6 +216,16 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
   Region &outlinedFuncBody = outlinedFunc.getBody();
   injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map);
+  // Map memory attributions from the LaunchOp to the GPUFuncOp attributions.
+  for (const auto &[launchArg, funcArg] :
+       llvm::zip(launchOp.getWorkgroupAttributions(),
+                 outlinedFunc.getWorkgroupAttributions()))
+    map.map(launchArg, funcArg);
+  for (const auto &[launchArg, funcArg] :
+       llvm::zip(launchOp.getPrivateAttributions(),
+                 outlinedFunc.getPrivateAttributions()))
+    map.map(launchArg, funcArg);
   // Map arguments from gpu.launch region to the arguments of the gpu.func
   // operation.
   Block &entryBlock = outlinedFuncBody.front();

diff  --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 422e0c154dd47..ca776968d998b 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -310,3 +310,65 @@ func.func @non_constant_launches(%arg0 : index) {
 // CHECK-DL-LABEL: gpu.module @non_constant_launches_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+// CHECK: module attributes {gpu.container_module}
+// -----
+// This test checks memory attributions for gpu.launch, using both workgroup and private attributions.
+// CHECK-LABEL: func @launch_memory_attributions_0()
+func.func @launch_memory_attributions_0() {
+  %1 = "op"() : () -> (memref<?xf32, 1>)
+  %128 = arith.constant 128 : index
+  // CHECK: gpu.launch_func @launch_memory_attributions_0_kernel::@launch_memory_attributions_0_kernel
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %128, %grid_y = %128,
+                                       %grid_z = %128)
+             threads(%tx, %ty, %tz) in (%block_x = %128, %block_y = %128,
+                                        %block_z = %128)
+             workgroup(%shared: memref<42xf32, 3>)
+             private(%priv0: memref<2xf32, 5>, %priv1: memref<1xf32, 5>) {
+    "some_op"(%bx, %block_x) : (index, index) -> ()
+    %42 = memref.load %1[%tx] : memref<?xf32, 1>
+    %43 = memref.load %shared[%tx] : memref<42xf32, 3>
+    %44 = memref.load %priv1[%tx] : memref<1xf32, 5>
+    gpu.terminator
+  }
+  return
+// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_0_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}
+// CHECK-LABEL: gpu.module @launch_memory_attributions_0_kernel
+// CHECK-NEXT: gpu.func @launch_memory_attributions_0_kernel
+// CHECK-SAME: workgroup(%[[KERNEL_ARG1:.*]] : memref<42xf32, 3>)
+// CHECK-SAME: private(%[[KERNEL_ARG2:.*]] : memref<2xf32, 5>, %[[KERNEL_ARG3:.*]] : memref<1xf32, 5>)
+// CHECK: %[[TID:.*]] = gpu.thread_id x
+// CHECK: = memref.load %[[KERNEL_ARG1]][%[[TID]]] : memref<42xf32, 3>
+// CHECK-NEXT: = memref.load %[[KERNEL_ARG3]][%[[TID]]] : memref<1xf32, 5>
+// -----
+// This test checks correctness of private attributions in the absence of workgroup attributions.
+// CHECK-LABEL: @launch_memory_attributions_1
+func.func @launch_memory_attributions_1(%arg0 : memref<*xf32>) {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %d = memref.dim %arg0, %c2 : memref<*xf32>
+  // CHECK: gpu.func {{.*}}  private(%[[KERNEL_ARG:.*]] : memref<3xf32, 5>) {{.*}} {
+  // CHECK:   %[[C2:.*]] = arith.constant 2 : index
+  // CHECK: = memref.load %[[KERNEL_ARG]][%[[C2]]] : memref<3xf32, 5>
+  // CHECK:   gpu.return
+  // CHECK: }
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1,
+                                       %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1,
+                                        %block_z = %c1)
+             private(%priv0: memref<3xf32, 5>) {
+    %42 = memref.load %priv0[%c2] : memref<3xf32, 5>
+    gpu.terminator
+  }
+  return
+// CHECK-DL-LABEL: gpu.module @launch_memory_attributions_1_kernel attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<index, 32 : i32>>}


