[Mlir-commits] [mlir] [mlir][GPU] Improve handling of GPU bounds (PR #95166)

Krzysztof Drewniak llvmlistbot at llvm.org
Mon Jun 17 20:39:17 PDT 2024


https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/95166

>From de2679a2c84cd7be93fb6f81578cd227f9b1c040 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Tue, 11 Jun 2024 19:37:34 +0000
Subject: [PATCH 1/7] Let GPU ID bounds work on any FunctionOpInterfaces

This change removes the requirement that the known block or grid sizes
be stored on a gpu.func, and instead allows them on any function
implementing FunctionOpInterface. This allows, for instance, annotating
non-kernel functions that live in a func.func, or supporting downstream
use cases that don't use gpu.func.
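
A minimal sketch of what this enables (the function name and bound values
below are made up for illustration):

```mlir
// Hypothetical example: the bounds live on a plain func.func rather than a
// gpu.func; range inference for gpu.thread_id / gpu.block_id now picks them up.
module {
  func.func @not_a_gpu_func()
      attributes {gpu.known_block_size = array<i32: 32, 4, 1>,
                  gpu.known_grid_size = array<i32: 16, 2, 1>} {
    // Inferred range: 0 <= %tid_x <= 31.
    %tid_x = gpu.thread_id x
    // Inferred range: 0 <= %bid_x <= 15.
    %bid_x = gpu.block_id x
    return
  }
}
```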
---
 .../GPUCommon/IndexIntrinsicsOpLowering.h     |  6 +---
 .../GPU/IR/InferIntRangeInterfaceImpls.cpp    | 22 +++++++++++--
 .../test/Dialect/GPU/int-range-interface.mlir | 33 +++++++++++++++++++
 3 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
index d067c70a90ea4..0f74768207205 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -57,11 +57,7 @@ struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern<Op> {
       break;
     }
 
-    Operation *function;
-    if (auto gpuFunc = op->template getParentOfType<gpu::GPUFuncOp>())
-      function = gpuFunc;
-    if (auto llvmFunc = op->template getParentOfType<LLVM::LLVMFuncOp>())
-      function = llvmFunc;
+    Operation *function = op->template getParentOfType<FunctionOpInterface>();
     if (!boundsAttrName.empty() && function) {
       if (auto attr = function->template getAttrOfType<DenseI32ArrayAttr>(
               boundsAttrName)) {
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index 69017efb9a0e6..152884e23b929 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -8,6 +8,7 @@
 
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/IR/Matchers.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Interfaces/InferIntRangeInterface.h"
 #include "llvm/ADT/STLForwardCompat.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -54,6 +55,17 @@ static Value valueByDim(KernelDim3 dims, Dimension dim) {
 
 static uint64_t zext(uint32_t arg) { return static_cast<uint64_t>(arg); }
 
+static std::optional<uint32_t> getKnownLaunchAttr(FunctionOpInterface func,
+                                                  StringRef attrName,
+                                                  Dimension dim) {
+  auto bounds = func.getOperation()->getAttrOfType<DenseI32ArrayAttr>(attrName);
+  if (!bounds)
+    return std::nullopt;
+  if (bounds.size() < static_cast<uint32_t>(dim))
+    return std::nullopt;
+  return bounds[static_cast<uint32_t>(dim)];
+}
+
 template <typename Op>
 static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
   Dimension dim = op.getDimension();
@@ -73,12 +85,16 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
       return value.getZExtValue();
   }
 
-  if (auto func = op->template getParentOfType<GPUFuncOp>()) {
+  if (auto func = op->template getParentOfType<FunctionOpInterface>()) {
     switch (type) {
     case LaunchDims::Block:
-      return llvm::transformOptional(func.getKnownBlockSize(dim), zext);
+      return llvm::transformOptional(
+          getKnownLaunchAttr(func, GPUFuncOp::getKnownBlockSizeAttrName(), dim),
+          zext);
     case LaunchDims::Grid:
-      return llvm::transformOptional(func.getKnownGridSize(dim), zext);
+      return llvm::transformOptional(
+          getKnownLaunchAttr(func, GPUFuncOp::getKnownGridSizeAttrName(), dim),
+          zext);
     }
   }
   return std::nullopt;
diff --git a/mlir/test/Dialect/GPU/int-range-interface.mlir b/mlir/test/Dialect/GPU/int-range-interface.mlir
index a0917a2fdf110..a6c74fec6e824 100644
--- a/mlir/test/Dialect/GPU/int-range-interface.mlir
+++ b/mlir/test/Dialect/GPU/int-range-interface.mlir
@@ -215,3 +215,36 @@ module attributes {gpu.container_module} {
   }
 }
 
+// -----
+
+// CHECK-LABEL: func @annotated_kernel
+module {
+  func.func @annotated_kernel()
+    attributes {gpu.known_block_size = array<i32: 8, 12, 16>,
+        gpu.known_grid_size = array<i32: 20, 24, 28>} {
+
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+    %block_id_z = gpu.block_id z
+
+    // CHECK: test.reflect_bounds {smax = 19 : index, smin = 0 : index, umax = 19 : index, umin = 0 : index}
+    // CHECK: test.reflect_bounds {smax = 23 : index, smin = 0 : index, umax = 23 : index, umin = 0 : index}
+    // CHECK: test.reflect_bounds {smax = 27 : index, smin = 0 : index, umax = 27 : index, umin = 0 : index}
+    %block_id_x0 = test.reflect_bounds %block_id_x : index
+    %block_id_y0 = test.reflect_bounds %block_id_y : index
+    %block_id_z0 = test.reflect_bounds %block_id_z : index
+
+    %thread_id_x = gpu.thread_id x
+    %thread_id_y = gpu.thread_id y
+    %thread_id_z = gpu.thread_id z
+
+    // CHECK: test.reflect_bounds {smax = 7 : index, smin = 0 : index, umax = 7 : index, umin = 0 : index}
+    // CHECK: test.reflect_bounds {smax = 11 : index, smin = 0 : index, umax = 11 : index, umin = 0 : index}
+    // CHECK: test.reflect_bounds {smax = 15 : index, smin = 0 : index, umax = 15 : index, umin = 0 : index}
+    %thread_id_x0 = test.reflect_bounds %thread_id_x : index
+    %thread_id_y0 = test.reflect_bounds %thread_id_y : index
+    %thread_id_z0 = test.reflect_bounds %thread_id_z : index
+
+    return
+  }
+}

>From 6c9d5de75cf1183e3785059cefe338d2390b4d0b Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 14 Jun 2024 02:15:26 +0000
Subject: [PATCH 2/7] The big generalization

Make known_block_size and known_grid_size inherent attributes on gpu.func.

Also register them as discardable attributes on the GPU dialect.

Remove the ad-hoc attribute-name getters from GPUFuncOp.

Also add `upper_bound` attributes to all the index operations so that
people who want to make this information local and not context-dependent
can do so.

Tests haven't been updated yet, but I hope this round of generalizations
will address some of the concerns.
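
A minimal sketch of the op-local form (the bound values are made up for
illustration):

```mlir
// Hypothetical example: the bound travels with each index op instead of
// being read off a surrounding function attribute.
gpu.module @example {
  gpu.func @bounded_ids() kernel {
    %tid  = gpu.thread_id x upper_bound 64   // inferred: 0 <= %tid <= 63
    %bdim = gpu.block_dim x upper_bound 64   // inferred: 1 <= %bdim <= 64
    %lane = gpu.lane_id upper_bound 32       // inferred: 0 <= %lane <= 31
    gpu.return
  }
}
```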
---
 mlir/include/mlir/Dialect/GPU/IR/GPUBase.td   |   5 +
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    | 162 ++++++++++++------
 .../Conversion/GPUCommon/GPUOpsLowering.cpp   |  40 +++--
 .../GPUCommon/IndexIntrinsicsOpLowering.h     |  81 +++++++--
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  24 +--
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      |  38 ++--
 .../Conversion/VectorToGPU/VectorToGPU.cpp    |   6 +-
 .../GPU/IR/InferIntRangeInterfaceImpls.cpp    | 101 ++++++++---
 .../GPU/Transforms/KernelOutlining.cpp        |   6 +-
 9 files changed, 327 insertions(+), 136 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index 7b9d46fda12f5..860f893367203 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -62,6 +62,11 @@ def GPU_Dialect : Dialect {
     static bool isWorkgroupMemoryAddressSpace(Attribute memorySpace);
   }];
 
+  let discardableAttrs = (ins
+    "::mlir::DenseI32ArrayAttr":$known_block_size,
+    "::mlir::DenseI32ArrayAttr":$known_grid_size
+  );
+
   let dependentDialects = ["arith::ArithDialect"];
   let useDefaultAttributePrinterParser = 1;
   let useDefaultTypePrinterParser = 1;
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index eb81b6469746f..67fe5c4ff31f0 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -54,8 +54,9 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
         Pure,
         DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
         DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>])>,
-    Arguments<(ins GPU_DimensionAttr:$dimension)>, Results<(outs Index)> {
-  let assemblyFormat = "$dimension attr-dict";
+    Arguments<(ins GPU_DimensionAttr:$dimension,
+                   OptionalAttr<IndexAttr>:$upper_bound)>, Results<(outs Index)> {
+  let assemblyFormat = "$dimension (`upper_bound` $upper_bound^)? attr-dict";
   let extraClassDefinition = [{
     void $cppClass::getAsmResultNames(
         llvm::function_ref<void(mlir::Value, mlir::StringRef)> setNameFn) {
@@ -66,6 +67,14 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
       setNameFn(getResult(),resultName);
     }
   }];
+  let builders = [
+    OpBuilder<(ins "::mlir::gpu::Dimension":$dimension), [{
+      build($_builder, $_state, dimension, /*upperBound=*/nullptr);
+    }]>,
+    OpBuilder<(ins "::mlir::Type":$resultType, "::mlir::gpu::Dimension":$dimension), [{
+      build($_builder, $_state, resultType, dimension, /*upperBound=*/nullptr);
+    }]>
+  ];
 }
 
 def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
@@ -78,6 +87,12 @@ def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
     ```mlir
     %cDimX = gpu.cluster_dim x
     ```
+
+    If `upper_bound` is set, then executing (a lowering of) this operation in an
+    environment where the cluster dimension is greater than `upper_bound` causes
+    undefined behavior.
+
+    There is an implicit upper bound of `kMaxClusterDim` (currently 8).
   }];
 }
 
@@ -91,6 +106,12 @@ def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
     ```mlir
     %cIdY = gpu.cluster_id y
     ```
+
+    If `upper_bound` is set, then executing (a lowering of) this operation in an
+    environment where the cluster dimension along `dimension` is greater than
+    `upper_bound` causes undefined behavior.
+
+    There is an implicit upper bound of `kMaxClusterDim` (currently 8).
   }];
 }
 
@@ -104,6 +125,19 @@ def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
     ```mlir
     %bDimX = gpu.block_dim x
     ```
+
+    If `known_block_size` is set on this operation's enclosing `gpu.func`,
+    or `gpu.known_block_size` is set on an enclosing `FunctionOpInterface`
+    implementor, or if the enclosing `gpu.launch` specifies a constant size for
+    `dimension`'s blocks, these contextual facts may be used to infer that this
+    operation has a constant value, though such a transformation will not be
+    performed by canonicalization or the default constant folder. Executions which
+    cause that constant-value assumption to be false incur undefined behavior.
+
+    If `upper_bound` is set, executions where the block size along `dimension`
+    exceeds `upper_bound` cause undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
@@ -116,6 +150,13 @@ def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
     ```mlir
     %bIdY = gpu.block_id y
     ```
+
+    If `upper_bound` is set, or if one can be inferred from `known_grid_size`-type
+    annotations in context, executions where the block index in `dimension` would
+    be greater than or equal to that bound cause undefined behavior. `upper_bound`
+    takes priority over bounds inferrable from context.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
@@ -128,6 +169,20 @@ def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
     ```mlir
     %gDimZ = gpu.grid_dim z
     ```
+
+
+    If `known_grid_size` is set on this operation's enclosing `gpu.func`,
+    or `gpu.known_grid_size` is set on an enclosing `FunctionOpInterface`
+    implementor, or if the enclosing `gpu.launch` specifies a constant size for
+    `dimension`'s grid length, these contextual facts may be used to infer that this
+    operation has a constant value, though such a transformation will not be
+    performed by canonicalization or the default constant folder. Executions which
+    cause that constant-value assumption to be false incur undefined behavior.
+
+    If `upper_bound` is set, executions where the grid size in `dimension` would
+    exceed `upper_bound` cause undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
@@ -140,6 +195,12 @@ def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
     ```mlir
     %tIdX = gpu.thread_id x
     ```
+
+    If `upper_bound` is set, or if one can be inferred from `known_block_size`-type
+    annotations in context, executions where the thread index would be greater
+    than or equal to that bound cause undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 
@@ -152,14 +213,21 @@ def GPU_LaneIdOp : GPU_Op<"lane_id", [
     ```mlir
     %laneId = gpu.lane_id
     ```
+
+    If `upper_bound` is set, executions with more than `upper_bound` lanes per
+    subgroup cause undefined behavior. In the absence of `upper_bound`,
+    the lane id is still assumed to be non-negative and less than the
+    target-independent `kMaxSubgroupSize` (currently 128).
   }];
+  let arguments = (ins OptionalAttr<IndexAttr>:$upper_bound);
   let results = (outs Index:$result);
-  let assemblyFormat = "attr-dict";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict";
 }
 
 def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
       Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
-    Arguments<(ins)>, Results<(outs Index:$result)> {
+    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
+    Results<(outs Index:$result)> {
   let description = [{
     Returns the subgroup id, i.e., the index of the current subgroup within the
     workgroup.
@@ -169,9 +237,13 @@ def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
     ```mlir
     %sgId = gpu.subgroup_id : index
     ```
+
+    If `upper_bound` is set, executions where there are more than `upper_bound`
+    subgroups per workgroup cause undefined behavior. There is an implicit
+    upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 
-  let assemblyFormat = "attr-dict `:` type($result)";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
 }
 
 def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> {
@@ -184,14 +256,20 @@ def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> {
 
     ```mlir
     %gidX = gpu.global_id x
+    %gidX = gpu.global_id x upper_bound 65536
     ```
+
+    The `upper_bound` attribute defines an upper bound analogously to the ones on
+    `thread_id` and `block_id`. If one is not set, the bound may be inferred from
+    a combination of `known_block_size` and `known_grid_size`-type annotations.
   }];
 }
 
 
 def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
       Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
-    Arguments<(ins)>, Results<(outs Index:$result)> {
+    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
+    Results<(outs Index:$result)> {
   let description = [{
     Returns the number of subgroups within a workgroup.
 
@@ -200,14 +278,19 @@ def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
     ```mlir
     %numSg = gpu.num_subgroups : index
     ```
+
+    If `upper_bound` is set, executions with more than `upper_bound` subgroups
+    per workgroup cause undefined behavior. There is a default upper bound of
+    `kMaxDim` (currently uint32_t::max).
   }];
 
-  let assemblyFormat = "attr-dict `:` type($result)";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
 }
 
 def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
       Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
-    Arguments<(ins)>, Results<(outs Index:$result)> {
+    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
+    Results<(outs Index:$result)> {
   let description = [{
     Returns the number of threads within a subgroup.
 
@@ -216,11 +299,20 @@ def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
     ```mlir
     %sgSz = gpu.subgroup_size : index
     ```
+
+    Executions where the number of threads per subgroup exceeds `upper_bound` cause
+    undefined behavior. When no `upper_bound` is specified, range analyses and
+    similar machinery assume the default bound of `kMaxSubgroupSize`, currently
+    128.
   }];
 
-  let assemblyFormat = "attr-dict `:` type($result)";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
 }
 
+def GPU_OptionalDimSizeHintAttr : ConfinedAttr<OptionalAttr<DenseI32ArrayAttr>,
+  [AttrConstraint<Or<[IsNullAttr.predicate, DenseArrayCount<3>.predicate]>,
+    "array of 3 32-bit integers (if present)">]>;
+
 def GPU_GPUFuncOp : GPU_Op<"func", [
     HasParent<"GPUModuleOp">, AutomaticAllocationScope, FunctionOpInterface,
     IsolatedFromAbove
@@ -249,12 +341,14 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
     body region, are not supported.
 
     A function may optionally be annotated with the block and/or grid sizes
-    that will be used when it is launched using the `gpu.known_block_size` and
-    `gpu.known_grid_size` attributes, respectively. If set, these attributes must
+    that will be used when it is launched using the `known_block_size` and
+    `known_grid_size` attributes, respectively. If set, these attributes must
     be arrays of three 32-bit integers giving the x, y, and z launch dimensions.
     Launching a kernel that has these annotations, or that calls a function with
     these annotations, using a block size or grid size other than what is specified
-    is undefined behavior.
+    is undefined behavior. These attributes may be set on non-`gpu.func` functions
+    by using `gpu.known_block_size` or `gpu.known_grid_size`, but this carries
+    the risk that they will be discarded.
 
     Syntax:
 
@@ -297,7 +391,9 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
                        OptionalAttr<DictArrayAttr>:$arg_attrs,
                        OptionalAttr<DictArrayAttr>:$res_attrs,
                        OptionalAttr<DictArrayAttr>:$workgroup_attrib_attrs,
-                       OptionalAttr<DictArrayAttr>:$private_attrib_attrs);
+                       OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
+                       GPU_OptionalDimSizeHintAttr:$known_block_size,
+                       GPU_OptionalDimSizeHintAttr:$known_grid_size);
   let regions = (region AnyRegion:$body);
 
   let skipDefaultBuilders = 1;
@@ -420,36 +516,6 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
       return "workgroup_attributions";
     }
 
-    static constexpr StringLiteral getKnownBlockSizeAttrName() {
-      return StringLiteral("gpu.known_block_size");
-    }
-
-    static constexpr StringLiteral getKnownGridSizeAttrName() {
-      return StringLiteral("gpu.known_grid_size");
-    }
-
-    /// Returns the block size this kernel will be launched with along
-    /// dimension `dim` if known. The value of gpu.thread_id dim will be strictly
-    /// less than this size.
-    std::optional<uint32_t> getKnownBlockSize(gpu::Dimension dim) {
-      if (auto array =
-        (*this)->getAttrOfType<DenseI32ArrayAttr>(getKnownBlockSizeAttrName())) {
-        return array[static_cast<uint32_t>(dim)];
-      }
-      return std::nullopt;
-    }
-
-    /// Returns the grid size this kernel will be launched with along
-    /// dimension `dim` if known. The value of gpu.block_id dim will be strictly
-    /// less than this size.
-    std::optional<uint32_t> getKnownGridSize(gpu::Dimension dim) {
-      if (auto array =
-        (*this)->getAttrOfType<DenseI32ArrayAttr>(getKnownGridSizeAttrName())) {
-        return array[static_cast<uint32_t>(dim)];
-      }
-      return std::nullopt;
-    }
-
     /// Returns the argument types of this function.
     ArrayRef<Type> getArgumentTypes() { return getFunctionType().getInputs(); }
 
@@ -692,8 +758,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
-               Optional<Index>:$clusterSizeX, 
-               Optional<Index>:$clusterSizeY, 
+               Optional<Index>:$clusterSizeX,
+               Optional<Index>:$clusterSizeY,
                Optional<Index>:$clusterSizeZ,
                Optional<I32>:$dynamicSharedMemorySize)>,
     Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
@@ -717,7 +783,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     to the amount of dynamic shared memory a kernel's workgroup should be
     allocated; when this operand is not present, a zero size is assumed.
 
-    The body region has at least _twelve_ arguments, or _eighteen_ if cluster 
+    The body region has at least _twelve_ arguments, or _eighteen_ if cluster
     dimensions are present, grouped as follows:
 
     -   three optional arguments that contain cluster identifiers along x,y,z
@@ -790,7 +856,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
                blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
                threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
     {
-      // Cluster, block and thread identifiers, as well as cluster/block/grid 
+      // Cluster, block and thread identifiers, as well as cluster/block/grid
       // sizes are immediately usable inside body region.
       "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
     }
@@ -867,7 +933,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     unsigned getNumConfigOperands() {
       return kNumConfigOperands + (hasClusterSize() ? 3 : 0);
     }
-    /// Returns the number of region attributes including cluster size 
+    /// Returns the number of region attributes including cluster size
     unsigned getNumConfigRegionAttributes() {
       return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0);
     }
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 36e10372e4bc5..7cf4604a3e20f 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -74,7 +74,9 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
         attr.getName() ==
             gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName() ||
         attr.getName() == gpuFuncOp.getWorkgroupAttribAttrsAttrName() ||
-        attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName())
+        attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName() ||
+        attr.getName() == gpuFuncOp.getKnownBlockSizeAttrName() ||
+        attr.getName() == gpuFuncOp.getKnownGridSizeAttrName())
       continue;
     if (attr.getName() == gpuFuncOp.getArgAttrsAttrName()) {
       argAttrs = gpuFuncOp.getArgAttrsAttr();
@@ -82,27 +84,31 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
     }
     attributes.push_back(attr);
   }
+
+  DenseI32ArrayAttr knownBlockSize = gpuFuncOp.getKnownBlockSizeAttr();
+  DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
+  // Ensure we don't lose information if the function is lowered before its
+  // surrounding context.
+  if (knownBlockSize)
+    attributes.emplace_back(
+        rewriter.getStringAttr(
+            gpu::GPUDialect::KnownBlockSizeAttrHelper::getNameStr()),
+        knownBlockSize);
+  if (knownGridSize)
+    attributes.emplace_back(
+        rewriter.getStringAttr(
+            gpu::GPUDialect::KnownGridSizeAttrHelper::getNameStr()),
+        knownGridSize);
+
   // Add a dialect specific kernel attribute in addition to GPU kernel
   // attribute. The former is necessary for further translation while the
   // latter is expected by gpu.launch_func.
   if (gpuFuncOp.isKernel()) {
     attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
-
-    // Set the block size attribute if it is present.
-    if (kernelBlockSizeAttributeName.has_value()) {
-      std::optional<int32_t> dimX =
-          gpuFuncOp.getKnownBlockSize(gpu::Dimension::x);
-      std::optional<int32_t> dimY =
-          gpuFuncOp.getKnownBlockSize(gpu::Dimension::y);
-      std::optional<int32_t> dimZ =
-          gpuFuncOp.getKnownBlockSize(gpu::Dimension::z);
-      if (dimX.has_value() || dimY.has_value() || dimZ.has_value()) {
-        // If any of the dimensions are missing, fill them in with 1.
-        attributes.emplace_back(
-            kernelBlockSizeAttributeName.value(),
-            rewriter.getDenseI32ArrayAttr(
-                {dimX.value_or(1), dimY.value_or(1), dimZ.value_or(1)}));
-      }
+    // Set the dialect-specific block size attribute if there is one.
+    if (kernelBlockSizeAttributeName.has_value() && knownBlockSize) {
+      attributes.emplace_back(kernelBlockSizeAttributeName.value(),
+                              knownBlockSize);
     }
   }
   auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
index 0f74768207205..e4cd24e0380e7 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -14,6 +14,14 @@
 #include "mlir/IR/BuiltinAttributes.h"
 
 namespace mlir {
+namespace gpu {
+namespace index_lowering {
+enum class IndexKind : uint32_t { Other = 0, Block = 1, Grid = 2 };
+enum class IntrType : uint32_t {
+  None = 0,
+  Id = 1,
+  Dim = 2,
+};
 
 // Rewriting that replaces Op with XOp, YOp, or ZOp depending on the dimension
 // that Op operates on.  Op is assumed to return an `index` value and
@@ -21,22 +29,23 @@ namespace mlir {
 // `indexBitwidth`, sign-extend or truncate the resulting value to match the
 // bitwidth expected by the consumers of the value.
 template <typename Op, typename XOp, typename YOp, typename ZOp>
-struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern<Op> {
+struct OpLowering : public ConvertOpToLLVMPattern<Op> {
 private:
   unsigned indexBitwidth;
-  StringRef boundsAttrName;
+  IndexKind indexKind;
+  IntrType intrType;
 
 public:
-  explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter)
+  explicit OpLowering(LLVMTypeConverter &typeConverter)
       : ConvertOpToLLVMPattern<Op>(typeConverter),
         indexBitwidth(typeConverter.getIndexTypeBitwidth()),
-        boundsAttrName("") {}
+        indexKind(IndexKind::Other), intrType(IntrType::None) {}
 
-  explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter,
-                                       StringRef boundsAttrName)
+  explicit OpLowering(LLVMTypeConverter &typeConverter, IndexKind indexKind,
+                      IntrType intrType)
       : ConvertOpToLLVMPattern<Op>(typeConverter),
         indexBitwidth(typeConverter.getIndexTypeBitwidth()),
-        boundsAttrName(boundsAttrName) {}
+        indexKind(indexKind), intrType(intrType) {}
 
   // Convert the kernel arguments to an LLVM type, preserve the rest.
   LogicalResult
@@ -57,15 +66,58 @@ struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern<Op> {
       break;
     }
 
-    Operation *function = op->template getParentOfType<FunctionOpInterface>();
-    if (!boundsAttrName.empty() && function) {
-      if (auto attr = function->template getAttrOfType<DenseI32ArrayAttr>(
-              boundsAttrName)) {
-        int32_t maximum = attr[static_cast<uint32_t>(op.getDimension())];
-        newOp->setAttr("range", rewriter.getDenseI32ArrayAttr({0, maximum}));
+    // Order of priority for bounds:
+    // 1. The upper_bound attribute
+    // 2. Inherent attributes on a surrounding gpu.func
+    // 3. Discardable attributes on a surrounding function of any kind
+    // The below code handles these in reverse order so that more important
+    // sources overwrite less important ones.
+    DenseI32ArrayAttr funcBounds = nullptr;
+    if (auto funcOp = op->template getParentOfType<FunctionOpInterface>()) {
+      switch (indexKind) {
+      case IndexKind::Block: {
+        auto blockHelper =
+            gpu::GPUDialect::KnownBlockSizeAttrHelper(op.getContext());
+        if (blockHelper.isAttrPresent(funcOp))
+          funcBounds = blockHelper.getAttr(funcOp);
+        break;
+      }
+      case IndexKind::Grid: {
+        auto gridHelper =
+            gpu::GPUDialect::KnownGridSizeAttrHelper(op.getContext());
+        if (gridHelper.isAttrPresent(funcOp))
+          funcBounds = gridHelper.getAttr(funcOp);
+        break;
+      }
+      case IndexKind::Other:
+        break;
       }
     }
+    if (auto gpuFunc = op->template getParentOfType<gpu::GPUFuncOp>()) {
+      switch (indexKind) {
+      case IndexKind::Block:
+        funcBounds = gpuFunc.getKnownBlockSizeAttr();
+        break;
+      case IndexKind::Grid:
+        funcBounds = gpuFunc.getKnownGridSizeAttr();
+        break;
+      case IndexKind::Other:
+        break;
+      }
+    }
+    std::optional<int32_t> upperBound;
+    if (funcBounds)
+      upperBound =
+          funcBounds.asArrayRef()[static_cast<uint32_t>(op.getDimension())];
+    if (auto opBound = op.getUpperBound())
+      upperBound = opBound->getZExtValue();
 
+    if (upperBound && intrType != IntrType::None) {
+      int32_t min = (intrType == IntrType::Dim ? 1 : 0);
+      int32_t max = *upperBound - (intrType == IntrType::Id ? 0 : 1);
+      newOp->setAttr(
+          "range", DenseI32ArrayAttr::get(op.getContext(), ArrayRef{min, max}));
+    }
     if (indexBitwidth > 32) {
       newOp = rewriter.create<LLVM::SExtOp>(
           loc, IntegerType::get(context, indexBitwidth), newOp->getResult(0));
@@ -78,7 +130,8 @@ struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern<Op> {
     return success();
   }
 };
-
+} // namespace index_lowering
+} // namespace gpu
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUCOMMON_INDEXINTRINSICSOPLOWERING_H_
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index b95fba20a00cb..ef1cc345149aa 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -336,18 +336,18 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
   populateWithGenerated(patterns);
   patterns.add<GPUPrintfOpToVPrintfLowering>(converter);
   patterns.add<
-      GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
-                                  NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
-      GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
-                                  NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
-      GPUIndexIntrinsicOpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
-                                  NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>,
-      GPUIndexIntrinsicOpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
-                                  NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
-      GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
-                                  NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
-      GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
-                                  NVVM::GridDimYOp, NVVM::GridDimZOp>,
+      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
+                                      NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
+      gpu::index_lowering::OpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
+                                      NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
+      gpu::index_lowering::OpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
+                                      NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>,
+      gpu::index_lowering::OpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
+                                      NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
+      gpu::index_lowering::OpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
+                                      NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
+      gpu::index_lowering::OpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
+                                      NVVM::GridDimYOp, NVVM::GridDimZOp>,
       GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(converter);
 
   patterns.add<GPUDynamicSharedMemoryOpLowering>(
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 70dcccf0a7307..967452bb88173 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -305,9 +305,8 @@ struct LowerGpuOpsToROCDLOpsPass
     // Manually rewrite known block size attributes so the LLVMIR translation
     // infrastructure can pick them up.
     m.walk([&](LLVM::LLVMFuncOp op) {
-      if (auto blockSizes = dyn_cast_or_null<DenseI32ArrayAttr>(
-              op->removeAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName()))) {
-        reqdWorkGroupSizeAttrHelper.setAttr(op, blockSizes);
+      if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
+        auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
         // Also set up the rocdl.flat_work_group_size attribute to prevent
         // conflicting metadata.
         uint32_t flatSize = 1;
@@ -349,27 +348,32 @@ static void populateOpPatterns(LLVMTypeConverter &converter,
 void mlir::populateGpuToROCDLConversionPatterns(
     LLVMTypeConverter &converter, RewritePatternSet &patterns,
     mlir::gpu::amd::Runtime runtime) {
+  using gpu::index_lowering::IndexKind;
+  using gpu::index_lowering::IntrType;
   using mlir::gpu::amd::Runtime;
-
   populateWithGenerated(patterns);
-  patterns
-      .add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
-                                       ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
-          converter, gpu::GPUFuncOp::getKnownBlockSizeAttrName());
-  patterns.add<GPUIndexIntrinsicOpLowering<
+  patterns.add<
+      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
+                                      ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
+      converter, IndexKind::Block, IntrType::Id);
+  patterns.add<gpu::index_lowering::OpLowering<
       gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
-      converter, gpu::GPUFuncOp::getKnownGridSizeAttrName());
-  patterns
-      .add<GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
-                                       ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
-           GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
-                                       ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
-           GPUReturnOpLowering>(converter);
+      converter, IndexKind::Grid, IntrType::Id);
+  patterns.add<
+      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
+                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
+      converter, IndexKind::Block, IntrType::Dim);
+  patterns.add<gpu::index_lowering::OpLowering<
+      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
+      converter, IndexKind::Grid, IntrType::Dim);
+  patterns.add<GPUReturnOpLowering>(converter);
   patterns.add<GPUFuncOpLowering>(
       converter,
       /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
       /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
-      ROCDL::ROCDLDialect::KernelAttrHelper(&converter.getContext()).getName());
+      ROCDL::ROCDLDialect::KernelAttrHelper(&converter.getContext()).getName(),
+      ROCDL::ROCDLDialect::ReqdWorkGroupSizeAttrHelper(&converter.getContext())
+          .getName());
   if (Runtime::HIP == runtime) {
     patterns.add<GPUPrintfOpToHIPLowering>(converter);
   } else if (Runtime::OpenCL == runtime) {
diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
index 4496c2bc5fe8b..0fd91b27b7d21 100644
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -743,7 +743,7 @@ creatLdMatrixCompatibleLoads(RewriterBase &rewriter, vector::TransferReadOp op,
   }
 
   // Adjust the load offset.
-  auto laneId = rewriter.create<gpu::LaneIdOp>(loc);
+  auto laneId = rewriter.create<gpu::LaneIdOp>(loc, /*upperBound=*/nullptr);
   FailureOr<AffineMap> offsets =
       nvgpu::getLaneIdToLdMatrixMatrixCoord(rewriter, loc, *params);
   if (failed(offsets)) {
@@ -782,7 +782,7 @@ createNonLdMatrixLoads(RewriterBase &rewriter, vector::TransferReadOp op,
             "conversion to distributed non-ldmatrix compatible load");
   }
 
-  Value laneId = rewriter.create<gpu::LaneIdOp>(loc);
+  Value laneId = rewriter.create<gpu::LaneIdOp>(loc, /*upperBound=*/nullptr);
   SmallVector<Value, 4> elements;
 
   // This is the individual element type.
@@ -917,7 +917,7 @@ convertTransferWriteToStores(RewriterBase &rewriter, vector::TransferWriteOp op,
     return rewriter.notifyMatchFailure(op, "not mma sync reg info");
 
   VectorType vectorType = getMmaSyncVectorOperandType(*regInfo);
-  Value laneId = rewriter.create<gpu::LaneIdOp>(loc);
+  Value laneId = rewriter.create<gpu::LaneIdOp>(loc, /*upperBound=*/nullptr);
 
   for (unsigned i = 0; i < vectorType.getShape()[0]; i++) {
     Value logicalValueId = rewriter.create<arith::ConstantOp>(
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index 152884e23b929..3b555983ad0cc 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -55,7 +55,25 @@ static Value valueByDim(KernelDim3 dims, Dimension dim) {
 
 static uint64_t zext(uint32_t arg) { return static_cast<uint64_t>(arg); }
 
-static std::optional<uint32_t> getKnownLaunchAttr(FunctionOpInterface func,
+static std::optional<uint64_t>
+getKnownLaunchAttr(GPUFuncOp func, LaunchDims dims, Dimension dim) {
+  DenseI32ArrayAttr bounds;
+  switch (dims) {
+  case LaunchDims::Block:
+    bounds = func.getKnownBlockSizeAttr();
+    break;
+  case LaunchDims::Grid:
+    bounds = func.getKnownGridSizeAttr();
+    break;
+  }
+  if (!bounds)
+    return std::nullopt;
+  if (bounds.size() < static_cast<uint32_t>(dim))
+    return std::nullopt;
+  return zext(bounds[static_cast<uint32_t>(dim)]);
+}
+
+static std::optional<uint64_t> getKnownLaunchAttr(FunctionOpInterface func,
                                                   StringRef attrName,
                                                   Dimension dim) {
   auto bounds = func.getOperation()->getAttrOfType<DenseI32ArrayAttr>(attrName);
@@ -63,7 +81,7 @@ static std::optional<uint32_t> getKnownLaunchAttr(FunctionOpInterface func,
     return std::nullopt;
   if (bounds.size() < static_cast<uint32_t>(dim))
     return std::nullopt;
-  return bounds[static_cast<uint32_t>(dim)];
+  return zext(bounds[static_cast<uint32_t>(dim)]);
 }
 
 template <typename Op>
@@ -85,29 +103,41 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
       return value.getZExtValue();
   }
 
+  if (auto gpuFunc = op->template getParentOfType<GPUFuncOp>()) {
+    auto inherentAttr = getKnownLaunchAttr(gpuFunc, type, dim);
+    if (inherentAttr)
+      return inherentAttr;
+  }
   if (auto func = op->template getParentOfType<FunctionOpInterface>()) {
+    StringRef attrName;
     switch (type) {
     case LaunchDims::Block:
-      return llvm::transformOptional(
-          getKnownLaunchAttr(func, GPUFuncOp::getKnownBlockSizeAttrName(), dim),
-          zext);
+      attrName = GPUDialect::KnownBlockSizeAttrHelper::getNameStr();
+      break;
     case LaunchDims::Grid:
-      return llvm::transformOptional(
-          getKnownLaunchAttr(func, GPUFuncOp::getKnownGridSizeAttrName(), dim),
-          zext);
+      attrName = GPUDialect::KnownGridSizeAttrHelper::getNameStr();
+      break;
     }
+    auto discardableAttr = getKnownLaunchAttr(func, attrName, dim);
+    if (discardableAttr)
+      return discardableAttr;
   }
   return std::nullopt;
 }
 
 void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                      SetIntRangeFn setResultRange) {
-  setResultRange(getResult(), getIndexRange(1, kMaxClusterDim));
+  uint64_t max = kMaxClusterDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, max));
 }
 
 void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                     SetIntRangeFn setResultRange) {
   uint64_t max = kMaxClusterDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
@@ -116,14 +146,21 @@ void BlockDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
   std::optional<uint64_t> knownVal =
       getKnownLaunchDim(*this, LaunchDims::Block);
   if (knownVal)
-    setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
-  else
-    setResultRange(getResult(), getIndexRange(1, kMaxDim));
+    return setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
+  ;
+  uint64_t max = kMaxDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, max));
 }
 
 void BlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                   SetIntRangeFn setResultRange) {
-  uint64_t max = getKnownLaunchDim(*this, LaunchDims::Grid).value_or(kMaxDim);
+  uint64_t max = kMaxDim;
+  if (auto fromContext = getKnownLaunchDim(*this, LaunchDims::Grid))
+    max = fromContext.value();
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
@@ -131,29 +168,45 @@ void GridDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                   SetIntRangeFn setResultRange) {
   std::optional<uint64_t> knownVal = getKnownLaunchDim(*this, LaunchDims::Grid);
   if (knownVal)
-    setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
-  else
-    setResultRange(getResult(), getIndexRange(1, kMaxDim));
+    return setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
+  uint64_t max = kMaxDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, max));
 }
 
 void ThreadIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                    SetIntRangeFn setResultRange) {
-  uint64_t max = getKnownLaunchDim(*this, LaunchDims::Block).value_or(kMaxDim);
+  uint64_t max = kMaxDim;
+  if (auto fromContext = getKnownLaunchDim(*this, LaunchDims::Block))
+    max = fromContext.value();
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
 void LaneIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                  SetIntRangeFn setResultRange) {
-  setResultRange(getResult(), getIndexRange(0, kMaxSubgroupSize - 1ULL));
+  uint64_t max = kMaxSubgroupSize;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
 void SubgroupIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                      SetIntRangeFn setResultRange) {
-  setResultRange(getResult(), getIndexRange(0, kMaxDim - 1ULL));
+  uint64_t max = kMaxDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
 void GlobalIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                    SetIntRangeFn setResultRange) {
+  if (auto specified = getUpperBound())
+    return setResultRange(getResult(),
+                          getIndexRange(0, specified->getZExtValue() - 1ULL));
+
   uint64_t blockDimMax =
       getKnownLaunchDim(*this, LaunchDims::Block).value_or(kMaxDim);
   uint64_t gridDimMax =
@@ -164,12 +217,18 @@ void GlobalIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 
 void NumSubgroupsOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                        SetIntRangeFn setResultRange) {
-  setResultRange(getResult(), getIndexRange(1, kMaxDim));
+  uint64_t max = kMaxDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, max));
 }
 
 void SubgroupSizeOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                        SetIntRangeFn setResultRange) {
-  setResultRange(getResult(), getIndexRange(1, kMaxSubgroupSize));
+  uint64_t max = kMaxSubgroupSize;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, max));
 }
 
 void LaunchOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index f5e80553ae72a..5f6556d915f41 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -208,12 +208,10 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
   // because multiple launches with the same body are not deduplicated.
   if (auto blockBounds =
           maybeConstantDimsAttr(launchOp.getBlockSizeOperandValues()))
-    outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName(),
-                          blockBounds);
+    outlinedFunc.setKnownBlockSizeAttr(blockBounds);
   if (auto gridBounds =
           maybeConstantDimsAttr(launchOp.getGridSizeOperandValues()))
-    outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownGridSizeAttrName(),
-                          gridBounds);
+    outlinedFunc.setKnownGridSizeAttr(gridBounds);
 
   IRMapping map;
 

>From eec884affeaea21ae1078eb969b3a01e2e66a48c Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 14 Jun 2024 22:47:10 +0000
Subject: [PATCH 3/7] Update tests

---
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    |  4 +-
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  2 +-
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        | 37 ++++----
 .../Conversion/GPUToNVVM/gpu-to-nvvm.mlir     |  4 +-
 .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir   | 20 +++--
 .../test/Dialect/GPU/int-range-interface.mlir | 85 ++++++++++++++++++-
 mlir/test/Dialect/GPU/invalid.mlir            | 21 +++--
 mlir/test/Dialect/GPU/outlining.mlir          | 16 ++--
 8 files changed, 138 insertions(+), 51 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 771ae5c99a05a..6fff953adf091 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -348,7 +348,7 @@ def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
 
 def GPU_OptionalDimSizeHintAttr : ConfinedAttr<OptionalAttr<DenseI32ArrayAttr>,
   [AttrConstraint<Or<[IsNullAttr.predicate, DenseArrayCount<3>.predicate]>,
-    "array of 3 32-bit integers (if present)">]>;
+    "with 3 elements (if present)">]>;
 
 def GPU_GPUFuncOp : GPU_Op<"func", [
     HasParent<"GPUModuleOp">, AutomaticAllocationScope, FunctionOpInterface,
@@ -573,8 +573,6 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
     LogicalResult verifyBody();
   }];
   let hasCustomAssemblyFormat = 1;
-
-  let hasVerifier = 1;
 }
 
 def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index dbe4848ab5961..fea8a0ddc7f06 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -348,7 +348,7 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
           gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
           NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>,
       gpu::index_lowering::OpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
-                                  NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
+                                      NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
       gpu::index_lowering::OpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
                                       NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
       gpu::index_lowering::OpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index d8e29da6512d4..edba0290a4581 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -309,8 +309,24 @@ void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
       .Default([](Type) { llvm_unreachable("unexpected 'gpu' type kind"); });
 }
 
+static LogicalResult verifyKnownLaunchSizeAttr(Operation *op,
+                                               NamedAttribute attr) {
+  auto array = llvm::dyn_cast<DenseI32ArrayAttr>(attr.getValue());
+  if (!array)
+    return op->emitOpError(Twine(attr.getName()) +
+                           " must be a dense i32 array");
+  if (array.size() != 3)
+    return op->emitOpError(Twine(attr.getName()) +
+                           " must contain exactly 3 elements");
+  return success();
+}
+
 LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
                                                    NamedAttribute attr) {
+  if (attr.getName() == getKnownBlockSizeAttrHelper().getName())
+    return verifyKnownLaunchSizeAttr(op, attr);
+  if (attr.getName() == getKnownGridSizeAttrHelper().getName())
+    return verifyKnownLaunchSizeAttr(op, attr);
   if (!llvm::isa<UnitAttr>(attr.getValue()) ||
       attr.getName() != getContainerModuleAttrName())
     return success();
@@ -1676,27 +1692,6 @@ LogicalResult GPUFuncOp::verifyBody() {
   return success();
 }
 
-static LogicalResult verifyKnownLaunchSizeAttr(gpu::GPUFuncOp op,
-                                               StringRef attrName) {
-  auto maybeAttr = op->getAttr(attrName);
-  if (!maybeAttr)
-    return success();
-  auto array = llvm::dyn_cast<DenseI32ArrayAttr>(maybeAttr);
-  if (!array)
-    return op.emitOpError(attrName + " must be a dense i32 array");
-  if (array.size() != 3)
-    return op.emitOpError(attrName + " must contain exactly 3 elements");
-  return success();
-}
-
-LogicalResult GPUFuncOp::verify() {
-  if (failed(verifyKnownLaunchSizeAttr(*this, getKnownBlockSizeAttrName())))
-    return failure();
-  if (failed(verifyKnownLaunchSizeAttr(*this, getKnownGridSizeAttrName())))
-    return failure();
-  return success();
-}
-
 //===----------------------------------------------------------------------===//
 // ReturnOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index cd701095d8e64..c57cfd2977836 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -672,8 +672,8 @@ gpu.module @test_module_32 {
 
 gpu.module @gpumodule {
 // CHECK-LABEL: func @kernel_with_block_size()
-// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 128, 1, 1>} 
-  gpu.func @kernel_with_block_size() kernel attributes {gpu.known_block_size = array<i32: 128, 1, 1>} {
+// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 128, 1, 1>}
+  gpu.func @kernel_with_block_size() kernel attributes {known_block_size = array<i32: 128, 1, 1>} {
     gpu.return
   }
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index a8d61a6a0f6fd..bf49a42a11577 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -70,13 +70,12 @@ gpu.module @test_module {
 // -----
 
 gpu.module @test_module {
-  // CHECK-LABEL: func @gpu_index_ops_range()
+  // CHECK-LABEL: func @gpu_index_ops_range
   // CHECK-SAME: rocdl.flat_work_group_size = "1536,1536"
   // CHECK-SAME: rocdl.reqd_work_group_size = array<i32: 8, 12, 16>
-  func.func @gpu_index_ops_range()
-      -> (index, index, index, index, index, index) attributes
-      {gpu.known_block_size = array<i32: 8, 12, 16>,
-       gpu.known_grid_size = array<i32: 20, 24, 28>} {
+  gpu.func @gpu_index_ops_range(%place: memref<i32>)  kernel attributes
+      {known_block_size = array<i32: 8, 12, 16>,
+       known_grid_size = array<i32: 20, 24, 28>} {
 
     // CHECK: rocdl.workitem.id.x {range = array<i32: 0, 8>} : i32
     %tIdX = gpu.thread_id x
@@ -92,8 +91,15 @@ gpu.module @test_module {
     // CHECK: rocdl.workgroup.id.z {range = array<i32: 0, 28>} : i32
     %bIdZ = gpu.block_id z
 
-    func.return %tIdX, %tIdY, %tIdZ, %bIdX, %bIdY, %bIdZ
-        : index, index, index, index, index, index
+    // Use the IDs so the calls are not eliminated as dead code.
+    %0 = arith.addi %tIdX, %tIdY : index
+    %1 = arith.addi %0, %tIdZ : index
+    %2 = arith.addi %1, %bIdX : index
+    %3 = arith.addi %2, %bIdY : index
+    %4 = arith.addi %3, %bIdZ : index
+    %5 = arith.index_cast %4 : index to i32
+    memref.store %5, %place[] : memref<i32>
+    gpu.return
   }
 }
 
diff --git a/mlir/test/Dialect/GPU/int-range-interface.mlir b/mlir/test/Dialect/GPU/int-range-interface.mlir
index a6c74fec6e824..1613f83b17bde 100644
--- a/mlir/test/Dialect/GPU/int-range-interface.mlir
+++ b/mlir/test/Dialect/GPU/int-range-interface.mlir
@@ -138,8 +138,8 @@ module attributes {gpu.container_module} {
 module attributes {gpu.container_module} {
   gpu.module @gpu_module {
     gpu.func @annotated_kernel() kernel
-      attributes {gpu.known_block_size = array<i32: 8, 12, 16>,
-          gpu.known_grid_size = array<i32: 20, 24, 28>} {
+      attributes {known_block_size = array<i32: 8, 12, 16>,
+          known_grid_size = array<i32: 20, 24, 28>} {
 
       %grid_dim_x = gpu.grid_dim x
       %grid_dim_y = gpu.grid_dim y
@@ -248,3 +248,84 @@ module {
     return
   }
 }
+
+// -----
+
+// CHECK-LABEL: func @local_bounds_kernel
+module attributes {gpu.container_module} {
+  gpu.module @gpu_module {
+    gpu.func @local_bounds_kernel() kernel {
+
+      %grid_dim_x = gpu.grid_dim x upper_bound 20
+      %grid_dim_y = gpu.grid_dim y upper_bound 24
+      %grid_dim_z = gpu.grid_dim z upper_bound 28
+
+      // CHECK: test.reflect_bounds {smax = 20 : index, smin = 1 : index, umax = 20 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 24 : index, smin = 1 : index, umax = 24 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 28 : index, smin = 1 : index, umax = 28 : index, umin = 1 : index}
+      %grid_dim_x0 = test.reflect_bounds %grid_dim_x : index
+      %grid_dim_y0 = test.reflect_bounds %grid_dim_y : index
+      %grid_dim_z0 = test.reflect_bounds %grid_dim_z : index
+
+      %block_id_x = gpu.block_id x upper_bound 20
+      %block_id_y = gpu.block_id y upper_bound 24
+      %block_id_z = gpu.block_id z upper_bound 28
+
+      // CHECK: test.reflect_bounds {smax = 19 : index, smin = 0 : index, umax = 19 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 23 : index, smin = 0 : index, umax = 23 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 27 : index, smin = 0 : index, umax = 27 : index, umin = 0 : index}
+      %block_id_x0 = test.reflect_bounds %block_id_x : index
+      %block_id_y0 = test.reflect_bounds %block_id_y : index
+      %block_id_z0 = test.reflect_bounds %block_id_z : index
+
+      %block_dim_x = gpu.block_dim x upper_bound 8
+      %block_dim_y = gpu.block_dim y upper_bound 12
+      %block_dim_z = gpu.block_dim z upper_bound 16
+
+      // CHECK: test.reflect_bounds {smax = 8 : index, smin = 1 : index, umax = 8 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 12 : index, smin = 1 : index, umax = 12 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 16 : index, smin = 1 : index, umax = 16 : index, umin = 1 : index}
+      %block_dim_x0 = test.reflect_bounds %block_dim_x : index
+      %block_dim_y0 = test.reflect_bounds %block_dim_y : index
+      %block_dim_z0 = test.reflect_bounds %block_dim_z : index
+
+      %thread_id_x = gpu.thread_id x upper_bound 8
+      %thread_id_y = gpu.thread_id y upper_bound 12
+      %thread_id_z = gpu.thread_id z upper_bound 16
+
+      // CHECK: test.reflect_bounds {smax = 7 : index, smin = 0 : index, umax = 7 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 11 : index, smin = 0 : index, umax = 11 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 15 : index, smin = 0 : index, umax = 15 : index, umin = 0 : index}
+      %thread_id_x0 = test.reflect_bounds %thread_id_x : index
+      %thread_id_y0 = test.reflect_bounds %thread_id_y : index
+      %thread_id_z0 = test.reflect_bounds %thread_id_z : index
+
+      %global_id_x = gpu.global_id x upper_bound 160
+      %global_id_y = gpu.global_id y upper_bound 288
+      %global_id_z = gpu.global_id z upper_bound 448
+
+      // CHECK: test.reflect_bounds {smax = 159 : index, smin = 0 : index, umax = 159 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 287 : index, smin = 0 : index, umax = 287 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 447 : index, smin = 0 : index, umax = 447 : index, umin = 0 : index}
+      %global_id_x0 = test.reflect_bounds %global_id_x : index
+      %global_id_y0 = test.reflect_bounds %global_id_y : index
+      %global_id_z0 = test.reflect_bounds %global_id_z : index
+
+      %subgroup_size = gpu.subgroup_size upper_bound 32 : index
+      %subgroup_id = gpu.subgroup_id upper_bound 32 : index
+      %num_subgroups = gpu.num_subgroups upper_bound 8 : index
+      %lane_id = gpu.lane_id upper_bound 64
+
+      // CHECK: test.reflect_bounds {smax = 32 : index, smin = 1 : index, umax = 32 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 31 : index, smin = 0 : index, umax = 31 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 8 : index, smin = 1 : index, umax = 8 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 63 : index, smin = 0 : index, umax = 63 : index, umin = 0 : index}
+      %subgroup_size0 = test.reflect_bounds %subgroup_size : index
+      %subgroup_id0 = test.reflect_bounds %subgroup_id : index
+      %num_subgroups0 = test.reflect_bounds %num_subgroups : index
+      %lane_id0 = test.reflect_bounds %lane_id : index
+
+      gpu.return
+    }
+  }
+}
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 273bc282b0b3b..e9d8f329be8ed 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -704,11 +704,9 @@ func.func @alloc() {
 // -----
 
 module attributes {gpu.container_module} {
-  gpu.module @kernel {
-    // expected-error@+1 {{'gpu.func' op gpu.known_block_size must be a dense i32 array}}
-    gpu.func @kernel() kernel attributes {gpu.known_block_size = 32 : i32} {
-      gpu.return
-    }
+  // expected-error@+1 {{'func.func' op gpu.known_block_size must be a dense i32 array}}
+  func.func @kernel() attributes {gpu.known_block_size = 32 : i32} {
+    func.return
   }
 }
 
@@ -716,8 +714,8 @@ module attributes {gpu.container_module} {
 
 module attributes {gpu.container_module} {
   gpu.module @kernel {
-    // expected-error@+1 {{'gpu.func' op gpu.known_block_size must contain exactly 3 elements}}
-    gpu.func @kernel() kernel attributes {gpu.known_block_size = array<i32: 2, 1>} {
+    // expected-error@+1 {{'gpu.func' op attribute 'known_block_size' failed to satisfy constraint: i32 dense array attribute with 3 elements (if present)}}
+    gpu.func @kernel() kernel attributes {known_block_size = array<i32: 2, 1>} {
       gpu.return
     }
   }
@@ -725,6 +723,15 @@ module attributes {gpu.container_module} {
 
 // -----
 
+module {
+  // expected-error@+1 {{'func.func' op gpu.known_block_size must contain exactly 3 elements}}
+  func.func @kernel() attributes {gpu.known_block_size = array<i32: 2, 1>} {
+    func.return
+  }
+}
+
+// -----
+
 module {
   // expected-error @+1 {{'gpu.module' op attribute 'targets' failed to satisfy constraint: array of GPU target attributes with at least 1 elements}}
   gpu.module @gpu_funcs [] {
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 47ebe326b5d12..7f44f11b47e06 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -40,8 +40,8 @@ func.func @launch() {
 // CHECK-LABEL: gpu.module @launch_kernel
 // CHECK-NEXT: gpu.func @launch_kernel
 // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
-// CHECK-SAME: gpu.known_block_size = array<i32: 20, 24, 28>
-// CHECK-SAME: gpu.known_grid_size = array<i32: 8, 12, 16>
+// CHECK-SAME: known_block_size = array<i32: 20, 24, 28>
+// CHECK-SAME: known_grid_size = array<i32: 8, 12, 16>
 // CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
 // CHECK-NEXT: = gpu.block_id y
 // CHECK-NEXT: = gpu.block_id z
@@ -108,8 +108,8 @@ llvm.func @launch_from_llvm_func() {
   // CHECK-NEXT: llvm.return
 
   // CHECK: gpu.func {{.*}} kernel attributes
-  // CHECK-SAME: gpu.known_block_size = array<i32: 1, 1, 1>
-  // CHECK-SAME: gpu.known_grid_size = array<i32: 1, 1, 1>
+  // CHECK-SAME: known_block_size = array<i32: 1, 1, 1>
+  // CHECK-SAME: known_grid_size = array<i32: 1, 1, 1>
   // CHECK: gpu.return
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %dim, %grid_y = %dim,
                                        %grid_z = %dim)
@@ -362,8 +362,8 @@ func.func @recursive_device_function() {
 
 // CHECK-LABEL: @non_constant_launches
 func.func @non_constant_launches(%arg0 : index) {
-  // CHECK-NOT: gpu.known_block_size
-  // CHECK-NOT: gpu.known_grid_size
+  // CHECK-NOT: known_block_size
+  // CHECK-NOT: known_grid_size
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %arg0, %grid_y = %arg0,
                                        %grid_z = %arg0)
              threads(%tx, %ty, %tz) in (%block_x = %arg0, %block_y = %arg0,
@@ -484,8 +484,8 @@ func.func @launch_cluster() {
 // CHECK-LABEL: gpu.module @launch_cluster_kernel
 // CHECK-NEXT: gpu.func @launch_cluster_kernel
 // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
-// CHECK-SAME: gpu.known_block_size = array<i32: 20, 24, 28>
-// CHECK-SAME: gpu.known_grid_size = array<i32: 8, 12, 16>
+// CHECK-SAME: known_block_size = array<i32: 20, 24, 28>
+// CHECK-SAME: known_grid_size = array<i32: 8, 12, 16>
 // CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
 // CHECK-NEXT: = gpu.block_id y
 // CHECK-NEXT: = gpu.block_id z

>From dd44f393ad12ca5859662122d49e2ede7b24bd7e Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 14 Jun 2024 17:59:55 -0500
Subject: [PATCH 4/7] Use dialect helpers properly per review comment

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
 mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 7cf4604a3e20f..20ddeb53e4fbd 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -89,15 +89,12 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
   DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
   // Ensure we don't lose information if the function is lowered before its
   // surrounding context.
+  auto *gpuDialect = cast<GPUDialect>(gpuFuncOp.getDialect());
   if (knownBlockSize)
-    attributes.emplace_back(
-        rewriter.getStringAttr(
-            gpu::GPUDialect::KnownBlockSizeAttrHelper::getNameStr()),
+    attributes.emplace_back(gpuDialect.getKnownBlockSizeAttrHelper().getName()),
         knownBlockSize);
   if (knownGridSize)
-    attributes.emplace_back(
-        rewriter.getStringAttr(
-            gpu::GPUDialect::KnownGridSizeAttrHelper::getNameStr()),
+    attributes.emplace_back(gpuDialect.getKnownGridSizeAttrHelper().getName()),
         knownGridSize);
 
   // Add a dialect specific kernel attribute in addition to GPU kernel

>From 0209b83eb6d72b69643de0e7e904f709ed6df3ab Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 14 Jun 2024 23:58:38 +0000
Subject: [PATCH 5/7] Whoops, that one doesn't compile

---
 mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 20ddeb53e4fbd..052a48c5650f0 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -89,13 +89,13 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
   DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
   // Ensure we don't lose information if the function is lowered before its
   // surrounding context.
-  auto *gpuDialect = cast<GPUDialect>(gpuFuncOp.getDialect());
+  auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
   if (knownBlockSize)
-    attributes.emplace_back(gpuDialect.getKnownBlockSizeAttrHelper().getName()),
-        knownBlockSize);
+    attributes.emplace_back(gpuDialect->getKnownBlockSizeAttrHelper().getName(),
+                            knownBlockSize);
   if (knownGridSize)
-    attributes.emplace_back(gpuDialect.getKnownGridSizeAttrHelper().getName()),
-        knownGridSize);
+    attributes.emplace_back(gpuDialect->getKnownGridSizeAttrHelper().getName(),
+                            knownGridSize);
 
   // Add a dialect specific kernel attribute in addition to GPU kernel
   // attribute. The former is necessary for further translation while the

>From 2147e6146b85b441e9290d0aaf09dbe558b1ef8a Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Mon, 17 Jun 2024 17:02:53 +0000
Subject: [PATCH 6/7] Address review feedback

---
 mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 967452bb88173..40eb15a491063 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -351,6 +351,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
   using gpu::index_lowering::IndexKind;
   using gpu::index_lowering::IntrType;
   using mlir::gpu::amd::Runtime;
+  auto *rocdlDialect =
+      converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
   populateWithGenerated(patterns);
   patterns.add<
       gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
@@ -371,9 +373,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
       converter,
       /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
       /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
-      ROCDL::ROCDLDialect::KernelAttrHelper(&converter.getContext()).getName(),
-      ROCDL::ROCDLDialect::ReqdWorkGroupSizeAttrHelper(&converter.getContext())
-          .getName());
+      rocdlDialect->getKernelAttrHelper().getName(),
+      rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName());
   if (Runtime::HIP == runtime) {
     patterns.add<GPUPrintfOpToHIPLowering>(converter);
   } else if (Runtime::OpenCL == runtime) {

>From 268cd0b5e324997bb5f8f2e97ebfc5ca74ca7f0a Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <krzysdrewniak at gmail.com>
Date: Mon, 17 Jun 2024 20:39:03 -0700
Subject: [PATCH 7/7] Address stray llvm::

---
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index edba0290a4581..1a0eec0f289e5 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -311,7 +311,7 @@ void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
 
 static LogicalResult verifyKnownLaunchSizeAttr(Operation *op,
                                                NamedAttribute attr) {
-  auto array = llvm::dyn_cast<DenseI32ArrayAttr>(attr.getValue());
+  auto array = dyn_cast<DenseI32ArrayAttr>(attr.getValue());
   if (!array)
     return op->emitOpError(Twine(attr.getName()) +
                            " must be a dense i32 array");



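For downstream readers, the invalid.mlir and outlining changes above suggest the following usage, shown here as a minimal sketch that is not part of the patch: the function name, body, and block sizes are invented for illustration. A plain func.func can carry the dialect-prefixed gpu.known_block_size attribute, and the relaxed handling is intended to pick it up and bound the GPU ID ops inside the function:

  module {
    // Hypothetical, downstream-style function (not from the patch). The
    // dialect-prefixed attribute must be a dense i32 array with exactly
    // 3 elements, matching the verifier errors tested above.
    func.func @downstream_helper() attributes {gpu.known_block_size = array<i32: 64, 1, 1>} {
      // With the bound above, thread_id x should infer the range [0, 63].
      %tid_x = gpu.thread_id x
      %tid_i32 = arith.index_cast %tid_x : index to i32
      func.return
    }
  }

The unprefixed known_block_size spelling, by contrast, appears only on gpu.func in the outlining checks above.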