[Mlir-commits] [mlir] 43fd4c4 - [mlir][GPU] Improve handling of GPU bounds (#95166)

llvmlistbot at llvm.org
Mon Jun 17 21:47:42 PDT 2024


Author: Krzysztof Drewniak
Date: 2024-06-17T23:47:38-05:00
New Revision: 43fd4c49bd8d54b9058620f0a885c7a5672fd602

URL: https://github.com/llvm/llvm-project/commit/43fd4c49bd8d54b9058620f0a885c7a5672fd602
DIFF: https://github.com/llvm/llvm-project/commit/43fd4c49bd8d54b9058620f0a885c7a5672fd602.diff

LOG: [mlir][GPU] Improve handling of GPU bounds (#95166)

This change reworks how range information for GPU dispatch IDs (block
IDs, thread IDs, and so on) is handled.

1. `known_block_size` and `known_grid_size` become inherent attributes
of GPU functions. This makes them less clunky to work with. As a
consequence, the `gpu.func` lowering patterns now only look at the
inherent attributes when setting target-specific attributes on the
`llvm.func` that they lower to.
2. At the same time, `gpu.known_block_size` and `gpu.known_grid_size`
are made official dialect-level discardable attributes which can be
placed on arbitrary functions. This allows for progressive lowerings
(without this, a lowering for `gpu.thread_id` couldn't know about the
bounds if it had already been moved from a `gpu.func` to an `llvm.func`)
and allows for range information to be provided even when
`gpu.*_{id,dim}` are being used outside of a `gpu.func` context.
3. All of these index operations have gained an optional `upper_bound`
attribute, allowing for an alternate mode of operation where the bounds
are specified locally rather than inherited from the operation's context.
This also allows handling cases where the precise launch sizes aren't
known but can be bounded more tightly than the maximum any platform's
API allows. (I'd like to thank @benvanik for pointing out that this
could be useful.) All three mechanisms are shown in the sketch below.
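
The following sketch is illustrative only (it is not code from this patch,
and the function names are hypothetical):

```mlir
gpu.module @kernels {
  // (1) Inherent attribute on the gpu.func: blocks are known to be 128x1x1.
  gpu.func @kernel() kernel attributes {known_block_size = array<i32: 128, 1, 1>} {
    %tid = gpu.thread_id x                // inferred range: [0, 127]
    gpu.return
  }
}

// (2) Discardable dialect attribute on an arbitrary function, e.g. after the
//     body has been moved out of a gpu.func during progressive lowering.
func.func @lowered_body() attributes {gpu.known_block_size = array<i32: 64, 1, 1>} {
  %tid = gpu.thread_id x                  // inferred range: [0, 63]
  return
}

// (3) A locally specified bound, independent of any surrounding function.
func.func @local_bound() {
  %tid = gpu.thread_id x upper_bound 32   // inferred range: [0, 31]
  return
}
```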

When inferring bounds (either for range inference or for setting `range`
during lowering), these sources of information are consulted in order of
specificity (`upper_bound` > inherent attribute > discardable attribute),
except that dimension sizes first check the `known_*_size` attributes to
see whether they can be constant-folded before consulting their `upper_bound`.
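
To sketch how these priorities interact (again illustrative, not taken
from the patch itself):

```mlir
gpu.module @example {
  gpu.func @dims() kernel attributes {known_block_size = array<i32: 64, 1, 1>} {
    // The inherent known_block_size wins for the dimension size: range
    // inference treats this value as exactly 64 despite the looser upper_bound.
    %bdim = gpu.block_dim x upper_bound 128
    // No known_grid_size is in scope, so the local bound applies: range [1, 16].
    %gdim = gpu.grid_dim x upper_bound 16
    // For IDs, an explicit upper_bound takes priority over the context,
    // giving range [0, 31] here rather than [0, 63].
    %tid = gpu.thread_id x upper_bound 32
    gpu.return
  }
}
```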

This patch also updates the documentation about the bounds and inference
behavior to clarify what these attributes do when set and the
consequences of setting them up incorrectly.

---------

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
    mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
    mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
    mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
    mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
    mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
    mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
    mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
    mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
    mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
    mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
    mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
    mlir/test/Dialect/GPU/int-range-interface.mlir
    mlir/test/Dialect/GPU/invalid.mlir
    mlir/test/Dialect/GPU/outlining.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index 7b9d46fda12f5..860f893367203 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -62,6 +62,11 @@ def GPU_Dialect : Dialect {
     static bool isWorkgroupMemoryAddressSpace(Attribute memorySpace);
   }];
 
+  let discardableAttrs = (ins
+    "::mlir::DenseI32ArrayAttr":$known_block_size,
+    "::mlir::DenseI32ArrayAttr":$known_grid_size
+  );
+
   let dependentDialects = ["arith::ArithDialect"];
   let useDefaultAttributePrinterParser = 1;
   let useDefaultTypePrinterParser = 1;

diff  --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 5943de103beba..f47d4073ce842 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -54,8 +54,9 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
         Pure,
         DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
         DeclareOpInterfaceMethods<OpAsmOpInterface, ["getAsmResultNames"]>])>,
-    Arguments<(ins GPU_DimensionAttr:$dimension)>, Results<(outs Index)> {
-  let assemblyFormat = "$dimension attr-dict";
+    Arguments<(ins GPU_DimensionAttr:$dimension,
+                   OptionalAttr<IndexAttr>:$upper_bound)>, Results<(outs Index)> {
+  let assemblyFormat = "$dimension (`upper_bound` $upper_bound^)? attr-dict";
   let extraClassDefinition = [{
     void $cppClass::getAsmResultNames(
         llvm::function_ref<void(mlir::Value, mlir::StringRef)> setNameFn) {
@@ -66,6 +67,14 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
       setNameFn(getResult(),resultName);
     }
   }];
+  let builders = [
+    OpBuilder<(ins "::mlir::gpu::Dimension":$dimension), [{
+      build($_builder, $_state, dimension, /*upperBound=*/nullptr);
+    }]>,
+    OpBuilder<(ins "::mlir::Type":$resultType, "::mlir::gpu::Dimension":$dimension), [{
+      build($_builder, $_state, resultType, dimension, /*upperBound=*/nullptr);
+    }]>
+  ];
 }
 
 def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
@@ -78,6 +87,12 @@ def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
     ```mlir
     %cDimX = gpu.cluster_dim x
     ```
+
+    If `upper_bound` is set, then executing (a lowering of) this operation in an
+    environment where the number of clusters per grid is greater than `upper_bound`
+    causes undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 
@@ -91,6 +106,12 @@ def GPU_ClusterDimBlocksOp : GPU_IndexOp<"cluster_dim_blocks"> {
     ```mlir
     %cDimBlocksX = gpu.cluster_dim_blocks x
     ```
+
+    If `upper_bound` is set, then executing (a lowering of) this operation in an
+    environment where the number of thread blocks per cluster is greater than
+    `upper_bound` causes undefined behavior.
+
+    There is an implicit upper bound of `kMaxClusterDim` (currently 8).
   }];
 }
 
@@ -104,6 +125,12 @@ def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
     ```mlir
     %cIdY = gpu.cluster_id y
     ```
+
+    If `upper_bound` is set, then executing (a lowering of) this operation in an
+    environment where the number of clusters in the grid along `dimension` is
+    greater than `upper_bound` causes undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 
@@ -116,6 +143,12 @@ def GPU_ClusterBlockIdOp : GPU_IndexOp<"cluster_block_id"> {
     ```mlir
     %cBlockIdY = gpu.cluster_block_id y
     ```
+
+    If `upper_bound` is set, then executing (a lowering of) this operation in an
+    environment where the number of thread blocks per cluster along `dimension`
+    is greater than `upper_bound` causes undefined behavior.
+
+    There is an implicit upper bound of `kMaxClusterDim` (currently 8).
   }];
 }
 
@@ -129,6 +162,19 @@ def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
     ```mlir
     %bDimX = gpu.block_dim x
     ```
+
+    If `known_block_size` is set on this operation's enclosing `gpu.func`,
+    or `gpu.known_block_size` is set on an enclosing `FunctionOpInterface`
+    implementor, or if the enclosing `gpu.launch` specifies a constant size for
+    `dimension`'s blocks, these contextual facts may be used to infer that this
+    operation has a constant value, though such a transformation will not be
+    performed by canonicalization or the default constant folder. Executions which
+    cause that constant-value assumption to be false incur undefined behavior.
+
+    If `upper_bound` is set, executions where the block size along `dimension`
+    exceeds `upper_bound` cause undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
@@ -141,6 +187,13 @@ def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
     ```mlir
     %bIdY = gpu.block_id y
     ```
+
+    If `upper_bound` is set, or if one can be inferred from `known_grid_size`-type
+    annotations in context, executions where the block index in `dimension` would
+    be greater than or equal to that bound cause undefined behavior. `upper_bound`
+    takes priority over bounds inferrable from context.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
@@ -153,6 +206,20 @@ def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
     ```mlir
     %gDimZ = gpu.grid_dim z
     ```
+
+
+    If `known_grid_size` is set on this operation's enclosing `gpu.func`,
+    or `gpu.known_grid_size` is set on an enclosing `FunctionOpInterface`
+    implementor, or if the enclosing `gpu.launch` specifies a constant size for
+    `dimension`'s grid length, these contextual facts may be used to infer that this
+    operation has a constant value, though such a transformation will not be
+    performed by canonicalization or the default constant folder. Executions which
+    cause that constant-value assumption to be false incur undefined behavior.
+
+    If `upper_bound` is set, executions where the grid size in `dimension` would
+    exceed `upper_bound` cause undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
@@ -165,6 +232,12 @@ def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
     ```mlir
     %tIdX = gpu.thread_id x
     ```
+
+    If `upper_bound` is set, or if one can be inferred from `known_block_size`-type
+    annotations in context, executions where the thread index would be greater
+    than or equal to that bound cause undefined behavior.
+
+    There is an implicit upper bound of `kMaxDim` (currently uint32_t::max).
   }];
 }
 
@@ -177,14 +250,21 @@ def GPU_LaneIdOp : GPU_Op<"lane_id", [
     ```mlir
     %laneId = gpu.lane_id
     ```
+
+    If `upper_bound` is set, executions with more than `upper_bound` lanes per
+    subgroup cause undefined behavior. In the absence of `upper_bound`,
+    the lane id is still assumed to be non-negative and less than the
+    target-independent `kMaxSubgroupSize` (currently 128).
   }];
+  let arguments = (ins OptionalAttr<IndexAttr>:$upper_bound);
   let results = (outs Index:$result);
-  let assemblyFormat = "attr-dict";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict";
 }
 
 def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
       Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
-    Arguments<(ins)>, Results<(outs Index:$result)> {
+    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
+    Results<(outs Index:$result)> {
   let description = [{
     Returns the subgroup id, i.e., the index of the current subgroup within the
     workgroup.
@@ -194,9 +274,13 @@ def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [
     ```mlir
     %sgId = gpu.subgroup_id : index
     ```
+
+    If `upper_bound` is set, executions where there are more than `upper_bound`
+    subgroups per workgroup cause undefined behavior. There is an implicit upper
+    bound of `kMaxDim` (currently uint32_t::max).
   }];
 
-  let assemblyFormat = "attr-dict `:` type($result)";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
 }
 
 def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> {
@@ -209,14 +293,20 @@ def GPU_GlobalIdOp : GPU_IndexOp<"global_id"> {
 
     ```mlir
     %gidX = gpu.global_id x
+    %gidX = gpu.global_id x upper_bound 65536
     ```
+
+    The `upper_bound` attribute defines an upper bound analogously to the ones on
+    `thread_id` and `block_id`. If one is not set, the bound may be inferred from
+    a combination of `known_block_size` and `known_grid_size`-type annotations.
   }];
 }
 
 
 def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
       Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
-    Arguments<(ins)>, Results<(outs Index:$result)> {
+    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
+    Results<(outs Index:$result)> {
   let description = [{
     Returns the number of subgroups within a workgroup.
 
@@ -225,14 +315,19 @@ def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [
     ```mlir
     %numSg = gpu.num_subgroups : index
     ```
+
+    If `upper_bound` is set, executions with more than `upper_bound` subgroups
+    per workgroup cause undefined behavior. There is a default upper bound of
+    `kMaxDim` (currently uint32_t::max).
   }];
 
-  let assemblyFormat = "attr-dict `:` type($result)";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
 }
 
 def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
       Pure, DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>]>,
-    Arguments<(ins)>, Results<(outs Index:$result)> {
+    Arguments<(ins OptionalAttr<IndexAttr>:$upper_bound)>,
+    Results<(outs Index:$result)> {
   let description = [{
     Returns the number of threads within a subgroup.
 
@@ -241,11 +336,20 @@ def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [
     ```mlir
     %sgSz = gpu.subgroup_size : index
     ```
+
+    Executions where the number of threads per subgroup exceeds `upper_bound` cause
+    undefined behavior. When no `upper_bound` is specified, range analyses and
+    similar machinery assume the default bound of `kMaxSubgroupSize`, currently
+    128.
   }];
 
-  let assemblyFormat = "attr-dict `:` type($result)";
+  let assemblyFormat = "(`upper_bound` $upper_bound^)? attr-dict `:` type($result)";
 }
 
+def GPU_OptionalDimSizeHintAttr : ConfinedAttr<OptionalAttr<DenseI32ArrayAttr>,
+  [AttrConstraint<Or<[IsNullAttr.predicate, DenseArrayCount<3>.predicate]>,
+    "with 3 elements (if present)">]>;
+
 def GPU_GPUFuncOp : GPU_Op<"func", [
     HasParent<"GPUModuleOp">, AutomaticAllocationScope, FunctionOpInterface,
     IsolatedFromAbove
@@ -274,12 +378,14 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
     body region, are not supported.
 
     A function may optionally be annotated with the block and/or grid sizes
-    that will be used when it is launched using the `gpu.known_block_size` and
-    `gpu.known_grid_size` attributes, respectively. If set, these attributes must
+    that will be used when it is launched using the `known_block_size` and
+    `known_grid_size` attributes, respectively. If set, these attributes must
     be arrays of three 32-bit integers giving the x, y, and z launch dimensions.
     Launching a kernel that has these annotations, or that calls a function with
     these annotations, using a block size or grid size other than what is specified
-    is undefined behavior.
+    is undefined behavior. These attributes may be set on non-`gpu.func` functions
+    by using `gpu.known_block_size` or `gpu.known_grid_size`, but this carries
+    the risk that they will be discarded.
 
     Syntax:
 
@@ -322,7 +428,9 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
                        OptionalAttr<DictArrayAttr>:$arg_attrs,
                        OptionalAttr<DictArrayAttr>:$res_attrs,
                        OptionalAttr<DictArrayAttr>:$workgroup_attrib_attrs,
-                       OptionalAttr<DictArrayAttr>:$private_attrib_attrs);
+                       OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
+                       GPU_OptionalDimSizeHintAttr:$known_block_size,
+                       GPU_OptionalDimSizeHintAttr:$known_grid_size);
   let regions = (region AnyRegion:$body);
 
   let skipDefaultBuilders = 1;
@@ -445,36 +553,6 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
       return "workgroup_attributions";
     }
 
-    static constexpr StringLiteral getKnownBlockSizeAttrName() {
-      return StringLiteral("gpu.known_block_size");
-    }
-
-    static constexpr StringLiteral getKnownGridSizeAttrName() {
-      return StringLiteral("gpu.known_grid_size");
-    }
-
-    /// Returns the block size this kernel will be launched with along
-    /// dimension `dim` if known. The value of gpu.thread_id dim will be strictly
-    /// less than this size.
-    std::optional<uint32_t> getKnownBlockSize(gpu::Dimension dim) {
-      if (auto array =
-        (*this)->getAttrOfType<DenseI32ArrayAttr>(getKnownBlockSizeAttrName())) {
-        return array[static_cast<uint32_t>(dim)];
-      }
-      return std::nullopt;
-    }
-
-    /// Returns the grid size this kernel will be launched with along
-    /// dimension `dim` if known. The value of gpu.block_id dim will be strictly
-    /// less than this size.
-    std::optional<uint32_t> getKnownGridSize(gpu::Dimension dim) {
-      if (auto array =
-        (*this)->getAttrOfType<DenseI32ArrayAttr>(getKnownGridSizeAttrName())) {
-        return array[static_cast<uint32_t>(dim)];
-      }
-      return std::nullopt;
-    }
-
     /// Returns the argument types of this function.
     ArrayRef<Type> getArgumentTypes() { return getFunctionType().getInputs(); }
 
@@ -495,8 +573,6 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
     LogicalResult verifyBody();
   }];
   let hasCustomAssemblyFormat = 1;
-
-  let hasVerifier = 1;
 }
 
 def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
@@ -723,8 +799,8 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
-               Optional<Index>:$clusterSizeX, 
-               Optional<Index>:$clusterSizeY, 
+               Optional<Index>:$clusterSizeX,
+               Optional<Index>:$clusterSizeY,
                Optional<Index>:$clusterSizeZ,
                Optional<I32>:$dynamicSharedMemorySize)>,
     Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
@@ -748,7 +824,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     to the amount of dynamic shared memory a kernel's workgroup should be
     allocated; when this operand is not present, a zero size is assumed.
 
-    The body region has at least _twelve_ arguments, or _eighteen_ if cluster 
+    The body region has at least _twelve_ arguments, or _eighteen_ if cluster
     dimensions are present, grouped as follows:
 
     -   three optional arguments that contain cluster identifiers along x,y,z
@@ -821,7 +897,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
                blocks(%bx, %by, %bz) in (%sz_bx = %3, %sz_by = %4, %sz_bz = %5)
                threads(%tx, %ty, %tz) in (%sz_tx = %6, %sz_ty = %7, %sz_tz = %8)
     {
-      // Cluster, block and thread identifiers, as well as cluster/block/grid 
+      // Cluster, block and thread identifiers, as well as cluster/block/grid
       // sizes are immediately usable inside body region.
       "some_op"(%cx, %bx, %tx) : (index, index, index) -> ()
     }
@@ -898,7 +974,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
     unsigned getNumConfigOperands() {
       return kNumConfigOperands + (hasClusterSize() ? 3 : 0);
     }
-    /// Returns the number of region attributes including cluster size 
+    /// Returns the number of region attributes including cluster size
     unsigned getNumConfigRegionAttributes() {
       return kNumConfigRegionAttributes + (hasClusterSize() ? 6 : 0);
     }

diff  --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 36e10372e4bc5..052a48c5650f0 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -74,7 +74,9 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
         attr.getName() ==
             gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName() ||
         attr.getName() == gpuFuncOp.getWorkgroupAttribAttrsAttrName() ||
-        attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName())
+        attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName() ||
+        attr.getName() == gpuFuncOp.getKnownBlockSizeAttrName() ||
+        attr.getName() == gpuFuncOp.getKnownGridSizeAttrName())
       continue;
     if (attr.getName() == gpuFuncOp.getArgAttrsAttrName()) {
       argAttrs = gpuFuncOp.getArgAttrsAttr();
@@ -82,27 +84,28 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
     }
     attributes.push_back(attr);
   }
+
+  DenseI32ArrayAttr knownBlockSize = gpuFuncOp.getKnownBlockSizeAttr();
+  DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
+  // Ensure we don't lose information if the function is lowered before its
+  // surrounding context.
+  auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
+  if (knownBlockSize)
+    attributes.emplace_back(gpuDialect->getKnownBlockSizeAttrHelper().getName(),
+                            knownBlockSize);
+  if (knownGridSize)
+    attributes.emplace_back(gpuDialect->getKnownGridSizeAttrHelper().getName(),
+                            knownGridSize);
+
   // Add a dialect specific kernel attribute in addition to GPU kernel
   // attribute. The former is necessary for further translation while the
   // latter is expected by gpu.launch_func.
   if (gpuFuncOp.isKernel()) {
     attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
-
-    // Set the block size attribute if it is present.
-    if (kernelBlockSizeAttributeName.has_value()) {
-      std::optional<int32_t> dimX =
-          gpuFuncOp.getKnownBlockSize(gpu::Dimension::x);
-      std::optional<int32_t> dimY =
-          gpuFuncOp.getKnownBlockSize(gpu::Dimension::y);
-      std::optional<int32_t> dimZ =
-          gpuFuncOp.getKnownBlockSize(gpu::Dimension::z);
-      if (dimX.has_value() || dimY.has_value() || dimZ.has_value()) {
-        // If any of the dimensions are missing, fill them in with 1.
-        attributes.emplace_back(
-            kernelBlockSizeAttributeName.value(),
-            rewriter.getDenseI32ArrayAttr(
-                {dimX.value_or(1), dimY.value_or(1), dimZ.value_or(1)}));
-      }
+    // Set the dialect-specific block size attribute if there is one.
+    if (kernelBlockSizeAttributeName.has_value() && knownBlockSize) {
+      attributes.emplace_back(kernelBlockSizeAttributeName.value(),
+                              knownBlockSize);
     }
   }
   auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(

diff  --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
index d067c70a90ea4..e4cd24e0380e7 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -14,6 +14,14 @@
 #include "mlir/IR/BuiltinAttributes.h"
 
 namespace mlir {
+namespace gpu {
+namespace index_lowering {
+enum class IndexKind : uint32_t { Other = 0, Block = 1, Grid = 2 };
+enum class IntrType : uint32_t {
+  None = 0,
+  Id = 1,
+  Dim = 2,
+};
 
 // Rewriting that replaces Op with XOp, YOp, or ZOp depending on the dimension
 // that Op operates on.  Op is assumed to return an `index` value and
@@ -21,22 +29,23 @@ namespace mlir {
 // `indexBitwidth`, sign-extend or truncate the resulting value to match the
 // bitwidth expected by the consumers of the value.
 template <typename Op, typename XOp, typename YOp, typename ZOp>
-struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern<Op> {
+struct OpLowering : public ConvertOpToLLVMPattern<Op> {
 private:
   unsigned indexBitwidth;
-  StringRef boundsAttrName;
+  IndexKind indexKind;
+  IntrType intrType;
 
 public:
-  explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter)
+  explicit OpLowering(LLVMTypeConverter &typeConverter)
       : ConvertOpToLLVMPattern<Op>(typeConverter),
         indexBitwidth(typeConverter.getIndexTypeBitwidth()),
-        boundsAttrName("") {}
+        indexKind(IndexKind::Other), intrType(IntrType::None) {}
 
-  explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter,
-                                       StringRef boundsAttrName)
+  explicit OpLowering(LLVMTypeConverter &typeConverter, IndexKind indexKind,
+                      IntrType intrType)
       : ConvertOpToLLVMPattern<Op>(typeConverter),
         indexBitwidth(typeConverter.getIndexTypeBitwidth()),
-        boundsAttrName(boundsAttrName) {}
+        indexKind(indexKind), intrType(intrType) {}
 
   // Convert the kernel arguments to an LLVM type, preserve the rest.
   LogicalResult
@@ -57,19 +66,58 @@ struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern<Op> {
       break;
     }
 
-    Operation *function;
-    if (auto gpuFunc = op->template getParentOfType<gpu::GPUFuncOp>())
-      function = gpuFunc;
-    if (auto llvmFunc = op->template getParentOfType<LLVM::LLVMFuncOp>())
-      function = llvmFunc;
-    if (!boundsAttrName.empty() && function) {
-      if (auto attr = function->template getAttrOfType<DenseI32ArrayAttr>(
-              boundsAttrName)) {
-        int32_t maximum = attr[static_cast<uint32_t>(op.getDimension())];
-        newOp->setAttr("range", rewriter.getDenseI32ArrayAttr({0, maximum}));
+    // Order of priority for bounds:
+    // 1. The upper_bound attribute
+    // 2. Inherent attributes on a surrounding gpu.func
+    // 3. Discardable attributes on a surrounding function of any kind
+    // The below code handles these in reverse order so that more important
+    // sources overwrite less important ones.
+    DenseI32ArrayAttr funcBounds = nullptr;
+    if (auto funcOp = op->template getParentOfType<FunctionOpInterface>()) {
+      switch (indexKind) {
+      case IndexKind::Block: {
+        auto blockHelper =
+            gpu::GPUDialect::KnownBlockSizeAttrHelper(op.getContext());
+        if (blockHelper.isAttrPresent(funcOp))
+          funcBounds = blockHelper.getAttr(funcOp);
+        break;
+      }
+      case IndexKind::Grid: {
+        auto gridHelper =
+            gpu::GPUDialect::KnownGridSizeAttrHelper(op.getContext());
+        if (gridHelper.isAttrPresent(funcOp))
+          funcBounds = gridHelper.getAttr(funcOp);
+        break;
+      }
+      case IndexKind::Other:
+        break;
       }
     }
+    if (auto gpuFunc = op->template getParentOfType<gpu::GPUFuncOp>()) {
+      switch (indexKind) {
+      case IndexKind::Block:
+        funcBounds = gpuFunc.getKnownBlockSizeAttr();
+        break;
+      case IndexKind::Grid:
+        funcBounds = gpuFunc.getKnownGridSizeAttr();
+        break;
+      case IndexKind::Other:
+        break;
+      }
+    }
+    std::optional<int32_t> upperBound;
+    if (funcBounds)
+      upperBound =
+          funcBounds.asArrayRef()[static_cast<uint32_t>(op.getDimension())];
+    if (auto opBound = op.getUpperBound())
+      upperBound = opBound->getZExtValue();
 
+    if (upperBound && intrType != IntrType::None) {
+      int32_t min = (intrType == IntrType::Dim ? 1 : 0);
+      int32_t max = *upperBound - (intrType == IntrType::Id ? 0 : 1);
+      newOp->setAttr(
+          "range", DenseI32ArrayAttr::get(op.getContext(), ArrayRef{min, max}));
+    }
     if (indexBitwidth > 32) {
       newOp = rewriter.create<LLVM::SExtOp>(
           loc, IntegerType::get(context, indexBitwidth), newOp->getResult(0));
@@ -82,7 +130,8 @@ struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern<Op> {
     return success();
   }
 };
-
+} // namespace index_lowering
+} // namespace gpu
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUCOMMON_INDEXINTRINSICSOPLOWERING_H_

diff  --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index fdd65e40e9064..fea8a0ddc7f06 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -336,24 +336,23 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
   populateWithGenerated(patterns);
   patterns.add<GPUPrintfOpToVPrintfLowering>(converter);
   patterns.add<
-      GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
-                                  NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
-      GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
-                                  NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
-      GPUIndexIntrinsicOpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
-                                  NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>,
-      GPUIndexIntrinsicOpLowering<
+      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
+                                      NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
+      gpu::index_lowering::OpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
+                                      NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
+      gpu::index_lowering::OpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
+                                      NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>,
+      gpu::index_lowering::OpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
+                                      NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
+      gpu::index_lowering::OpLowering<
           gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
           NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>,
-      GPUIndexIntrinsicOpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
-                                  NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
-      GPUIndexIntrinsicOpLowering<
-          gpu::ClusterDimBlocksOp, NVVM::ClusterDimBlocksXOp,
-          NVVM::ClusterDimBlocksYOp, NVVM::ClusterDimBlocksZOp>,
-      GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
-                                  NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
-      GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
-                                  NVVM::GridDimYOp, NVVM::GridDimZOp>,
+      gpu::index_lowering::OpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
+                                      NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
+      gpu::index_lowering::OpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
+                                      NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
+      gpu::index_lowering::OpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
+                                      NVVM::GridDimYOp, NVVM::GridDimZOp>,
       GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(converter);
 
   patterns.add<GPUDynamicSharedMemoryOpLowering>(

diff  --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 70dcccf0a7307..40eb15a491063 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -305,9 +305,8 @@ struct LowerGpuOpsToROCDLOpsPass
     // Manually rewrite known block size attributes so the LLVMIR translation
     // infrastructure can pick them up.
     m.walk([&](LLVM::LLVMFuncOp op) {
-      if (auto blockSizes = dyn_cast_or_null<DenseI32ArrayAttr>(
-              op->removeAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName()))) {
-        reqdWorkGroupSizeAttrHelper.setAttr(op, blockSizes);
+      if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
+        auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
         // Also set up the rocdl.flat_work_group_size attribute to prevent
         // conflicting metadata.
         uint32_t flatSize = 1;
@@ -349,27 +348,33 @@ static void populateOpPatterns(LLVMTypeConverter &converter,
 void mlir::populateGpuToROCDLConversionPatterns(
     LLVMTypeConverter &converter, RewritePatternSet &patterns,
     mlir::gpu::amd::Runtime runtime) {
+  using gpu::index_lowering::IndexKind;
+  using gpu::index_lowering::IntrType;
   using mlir::gpu::amd::Runtime;
-
+  auto *rocdlDialect =
+      converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
   populateWithGenerated(patterns);
-  patterns
-      .add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
-                                       ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
-          converter, gpu::GPUFuncOp::getKnownBlockSizeAttrName());
-  patterns.add<GPUIndexIntrinsicOpLowering<
+  patterns.add<
+      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
+                                      ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
+      converter, IndexKind::Block, IntrType::Id);
+  patterns.add<gpu::index_lowering::OpLowering<
       gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
-      converter, gpu::GPUFuncOp::getKnownGridSizeAttrName());
-  patterns
-      .add<GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
-                                       ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
-           GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
-                                       ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
-           GPUReturnOpLowering>(converter);
+      converter, IndexKind::Grid, IntrType::Id);
+  patterns.add<
+      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
+                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
+      converter, IndexKind::Block, IntrType::Dim);
+  patterns.add<gpu::index_lowering::OpLowering<
+      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
+      converter, IndexKind::Grid, IntrType::Dim);
+  patterns.add<GPUReturnOpLowering>(converter);
   patterns.add<GPUFuncOpLowering>(
       converter,
       /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
       /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
-      ROCDL::ROCDLDialect::KernelAttrHelper(&converter.getContext()).getName());
+      rocdlDialect->getKernelAttrHelper().getName(),
+      rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName());
   if (Runtime::HIP == runtime) {
     patterns.add<GPUPrintfOpToHIPLowering>(converter);
   } else if (Runtime::OpenCL == runtime) {

diff  --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
index 4496c2bc5fe8b..0fd91b27b7d21 100644
--- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
+++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp
@@ -743,7 +743,7 @@ creatLdMatrixCompatibleLoads(RewriterBase &rewriter, vector::TransferReadOp op,
   }
 
   // Adjust the load offset.
-  auto laneId = rewriter.create<gpu::LaneIdOp>(loc);
+  auto laneId = rewriter.create<gpu::LaneIdOp>(loc, /*upperBound=*/nullptr);
   FailureOr<AffineMap> offsets =
       nvgpu::getLaneIdToLdMatrixMatrixCoord(rewriter, loc, *params);
   if (failed(offsets)) {
@@ -782,7 +782,7 @@ createNonLdMatrixLoads(RewriterBase &rewriter, vector::TransferReadOp op,
             "conversion to distributed non-ldmatrix compatible load");
   }
 
-  Value laneId = rewriter.create<gpu::LaneIdOp>(loc);
+  Value laneId = rewriter.create<gpu::LaneIdOp>(loc, /*upperBound=*/nullptr);
   SmallVector<Value, 4> elements;
 
   // This is the individual element type.
@@ -917,7 +917,7 @@ convertTransferWriteToStores(RewriterBase &rewriter, vector::TransferWriteOp op,
     return rewriter.notifyMatchFailure(op, "not mma sync reg info");
 
   VectorType vectorType = getMmaSyncVectorOperandType(*regInfo);
-  Value laneId = rewriter.create<gpu::LaneIdOp>(loc);
+  Value laneId = rewriter.create<gpu::LaneIdOp>(loc, /*upperBound=*/nullptr);
 
   for (unsigned i = 0; i < vectorType.getShape()[0]; i++) {
     Value logicalValueId = rewriter.create<arith::ConstantOp>(

diff  --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 17bf254b64b4d..3abaa3b3a81dd 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -309,8 +309,24 @@ void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
       .Default([](Type) { llvm_unreachable("unexpected 'gpu' type kind"); });
 }
 
+static LogicalResult verifyKnownLaunchSizeAttr(Operation *op,
+                                               NamedAttribute attr) {
+  auto array = dyn_cast<DenseI32ArrayAttr>(attr.getValue());
+  if (!array)
+    return op->emitOpError(Twine(attr.getName()) +
+                           " must be a dense i32 array");
+  if (array.size() != 3)
+    return op->emitOpError(Twine(attr.getName()) +
+                           " must contain exactly 3 elements");
+  return success();
+}
+
 LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
                                                    NamedAttribute attr) {
+  if (attr.getName() == getKnownBlockSizeAttrHelper().getName())
+    return verifyKnownLaunchSizeAttr(op, attr);
+  if (attr.getName() == getKnownGridSizeAttrHelper().getName())
+    return verifyKnownLaunchSizeAttr(op, attr);
   if (!llvm::isa<UnitAttr>(attr.getValue()) ||
       attr.getName() != getContainerModuleAttrName())
     return success();
@@ -1689,27 +1705,6 @@ LogicalResult GPUFuncOp::verifyBody() {
   return success();
 }
 
-static LogicalResult verifyKnownLaunchSizeAttr(gpu::GPUFuncOp op,
-                                               StringRef attrName) {
-  auto maybeAttr = op->getAttr(attrName);
-  if (!maybeAttr)
-    return success();
-  auto array = llvm::dyn_cast<DenseI32ArrayAttr>(maybeAttr);
-  if (!array)
-    return op.emitOpError(attrName + " must be a dense i32 array");
-  if (array.size() != 3)
-    return op.emitOpError(attrName + " must contain exactly 3 elements");
-  return success();
-}
-
-LogicalResult GPUFuncOp::verify() {
-  if (failed(verifyKnownLaunchSizeAttr(*this, getKnownBlockSizeAttrName())))
-    return failure();
-  if (failed(verifyKnownLaunchSizeAttr(*this, getKnownGridSizeAttrName())))
-    return failure();
-  return success();
-}
-
 //===----------------------------------------------------------------------===//
 // ReturnOp
 //===----------------------------------------------------------------------===//

diff  --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index 46b85db8b5431..f5e30a278f06b 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -8,6 +8,7 @@
 
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/IR/Matchers.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Interfaces/InferIntRangeInterface.h"
 #include "llvm/ADT/STLForwardCompat.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -54,6 +55,35 @@ static Value valueByDim(KernelDim3 dims, Dimension dim) {
 
 static uint64_t zext(uint32_t arg) { return static_cast<uint64_t>(arg); }
 
+static std::optional<uint64_t>
+getKnownLaunchAttr(GPUFuncOp func, LaunchDims dims, Dimension dim) {
+  DenseI32ArrayAttr bounds;
+  switch (dims) {
+  case LaunchDims::Block:
+    bounds = func.getKnownBlockSizeAttr();
+    break;
+  case LaunchDims::Grid:
+    bounds = func.getKnownGridSizeAttr();
+    break;
+  }
+  if (!bounds)
+    return std::nullopt;
+  if (bounds.size() < static_cast<uint32_t>(dim))
+    return std::nullopt;
+  return zext(bounds[static_cast<uint32_t>(dim)]);
+}
+
+static std::optional<uint64_t> getKnownLaunchAttr(FunctionOpInterface func,
+                                                  StringRef attrName,
+                                                  Dimension dim) {
+  auto bounds = func.getOperation()->getAttrOfType<DenseI32ArrayAttr>(attrName);
+  if (!bounds)
+    return std::nullopt;
+  if (bounds.size() < static_cast<uint32_t>(dim))
+    return std::nullopt;
+  return zext(bounds[static_cast<uint32_t>(dim)]);
+}
+
 template <typename Op>
 static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
   Dimension dim = op.getDimension();
@@ -73,37 +103,57 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
       return value.getZExtValue();
   }
 
-  if (auto func = op->template getParentOfType<GPUFuncOp>()) {
+  if (auto gpuFunc = op->template getParentOfType<GPUFuncOp>()) {
+    auto inherentAttr = getKnownLaunchAttr(gpuFunc, type, dim);
+    if (inherentAttr)
+      return inherentAttr;
+  }
+  if (auto func = op->template getParentOfType<FunctionOpInterface>()) {
+    StringRef attrName;
     switch (type) {
     case LaunchDims::Block:
-      return llvm::transformOptional(func.getKnownBlockSize(dim), zext);
+      attrName = GPUDialect::KnownBlockSizeAttrHelper::getNameStr();
+      break;
     case LaunchDims::Grid:
-      return llvm::transformOptional(func.getKnownGridSize(dim), zext);
+      attrName = GPUDialect::KnownGridSizeAttrHelper::getNameStr();
+      break;
     }
+    auto discardableAttr = getKnownLaunchAttr(func, attrName, dim);
+    if (discardableAttr)
+      return discardableAttr;
   }
   return std::nullopt;
 }
 
 void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                      SetIntRangeFn setResultRange) {
-  uint64_t max = APInt::getMaxValue(64).getZExtValue();
+  uint64_t max = kMaxDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(1, max));
 }
 
 void ClusterDimBlocksOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                            SetIntRangeFn setResultRange) {
-  setResultRange(getResult(), getIndexRange(1, kMaxClusterDim));
+  uint64_t max = kMaxClusterDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, max));
 }
 
 void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                     SetIntRangeFn setResultRange) {
-  uint64_t max = kMaxClusterDim;
+  uint64_t max = kMaxDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
 void ClusterBlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                          SetIntRangeFn setResultRange) {
   uint64_t max = kMaxClusterDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
@@ -112,14 +162,21 @@ void BlockDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
   std::optional<uint64_t> knownVal =
       getKnownLaunchDim(*this, LaunchDims::Block);
   if (knownVal)
-    setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
-  else
-    setResultRange(getResult(), getIndexRange(1, kMaxDim));
+    return setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
+  ;
+  uint64_t max = kMaxDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, max));
 }
 
 void BlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                   SetIntRangeFn setResultRange) {
-  uint64_t max = getKnownLaunchDim(*this, LaunchDims::Grid).value_or(kMaxDim);
+  uint64_t max = kMaxDim;
+  if (auto fromContext = getKnownLaunchDim(*this, LaunchDims::Grid))
+    max = fromContext.value();
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
@@ -127,29 +184,45 @@ void GridDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                   SetIntRangeFn setResultRange) {
   std::optional<uint64_t> knownVal = getKnownLaunchDim(*this, LaunchDims::Grid);
   if (knownVal)
-    setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
-  else
-    setResultRange(getResult(), getIndexRange(1, kMaxDim));
+    return setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
+  uint64_t max = kMaxDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, max));
 }
 
 void ThreadIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                    SetIntRangeFn setResultRange) {
-  uint64_t max = getKnownLaunchDim(*this, LaunchDims::Block).value_or(kMaxDim);
+  uint64_t max = kMaxDim;
+  if (auto fromContext = getKnownLaunchDim(*this, LaunchDims::Block))
+    max = fromContext.value();
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
 void LaneIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                  SetIntRangeFn setResultRange) {
-  setResultRange(getResult(), getIndexRange(0, kMaxSubgroupSize - 1ULL));
+  uint64_t max = kMaxSubgroupSize;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
 void SubgroupIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                      SetIntRangeFn setResultRange) {
-  setResultRange(getResult(), getIndexRange(0, kMaxDim - 1ULL));
+  uint64_t max = kMaxDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
 void GlobalIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                    SetIntRangeFn setResultRange) {
+  if (auto specified = getUpperBound())
+    return setResultRange(getResult(),
+                          getIndexRange(0, specified->getZExtValue() - 1ULL));
+
   uint64_t blockDimMax =
       getKnownLaunchDim(*this, LaunchDims::Block).value_or(kMaxDim);
   uint64_t gridDimMax =
@@ -160,12 +233,18 @@ void GlobalIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 
 void NumSubgroupsOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                        SetIntRangeFn setResultRange) {
-  setResultRange(getResult(), getIndexRange(1, kMaxDim));
+  uint64_t max = kMaxDim;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, max));
 }
 
 void SubgroupSizeOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                        SetIntRangeFn setResultRange) {
-  setResultRange(getResult(), getIndexRange(1, kMaxSubgroupSize));
+  uint64_t max = kMaxSubgroupSize;
+  if (auto specified = getUpperBound())
+    max = specified->getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, max));
 }
 
 void LaunchOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,

diff  --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index f5e80553ae72a..5f6556d915f41 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -208,12 +208,10 @@ static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
   // because multiple launches with the same body are not deduplicated.
   if (auto blockBounds =
           maybeConstantDimsAttr(launchOp.getBlockSizeOperandValues()))
-    outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName(),
-                          blockBounds);
+    outlinedFunc.setKnownBlockSizeAttr(blockBounds);
   if (auto gridBounds =
           maybeConstantDimsAttr(launchOp.getGridSizeOperandValues()))
-    outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownGridSizeAttrName(),
-                          gridBounds);
+    outlinedFunc.setKnownGridSizeAttr(gridBounds);
 
   IRMapping map;
 

diff  --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index cd701095d8e64..c57cfd2977836 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -672,8 +672,8 @@ gpu.module @test_module_32 {
 
 gpu.module @gpumodule {
 // CHECK-LABEL: func @kernel_with_block_size()
-// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 128, 1, 1>} 
-  gpu.func @kernel_with_block_size() kernel attributes {gpu.known_block_size = array<i32: 128, 1, 1>} {
+// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 128, 1, 1>}
+  gpu.func @kernel_with_block_size() kernel attributes {known_block_size = array<i32: 128, 1, 1>} {
     gpu.return
   }
 }

diff  --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index a8d61a6a0f6fd..bf49a42a11577 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -70,13 +70,12 @@ gpu.module @test_module {
 // -----
 
 gpu.module @test_module {
-  // CHECK-LABEL: func @gpu_index_ops_range()
+  // CHECK-LABEL: func @gpu_index_ops_range
   // CHECK-SAME: rocdl.flat_work_group_size = "1536,1536"
   // CHECK-SAME: rocdl.reqd_work_group_size = array<i32: 8, 12, 16>
-  func.func @gpu_index_ops_range()
-      -> (index, index, index, index, index, index) attributes
-      {gpu.known_block_size = array<i32: 8, 12, 16>,
-       gpu.known_grid_size = array<i32: 20, 24, 28>} {
+  gpu.func @gpu_index_ops_range(%place: memref<i32>)  kernel attributes
+      {known_block_size = array<i32: 8, 12, 16>,
+       known_grid_size = array<i32: 20, 24, 28>} {
 
     // CHECK: rocdl.workitem.id.x {range = array<i32: 0, 8>} : i32
     %tIdX = gpu.thread_id x
@@ -92,8 +91,15 @@ gpu.module @test_module {
     // CHECK: rocdl.workgroup.id.z {range = array<i32: 0, 28>} : i32
     %bIdZ = gpu.block_id z
 
-    func.return %tIdX, %tIdY, %tIdZ, %bIdX, %bIdY, %bIdZ
-        : index, index, index, index, index, index
+    // "Usage" to make the ID calls not die
+    %0 = arith.addi %tIdX, %tIdY : index
+    %1 = arith.addi %0, %tIdZ : index
+    %2 = arith.addi %1, %bIdX : index
+    %3 = arith.addi %2, %bIdY : index
+    %4 = arith.addi %3, %bIdZ : index
+    %5 = arith.index_cast %4 : index to i32
+    memref.store %5, %place[] : memref<i32>
+    gpu.return
   }
 }
 

diff  --git a/mlir/test/Dialect/GPU/int-range-interface.mlir b/mlir/test/Dialect/GPU/int-range-interface.mlir
index a0917a2fdf110..1613f83b17bde 100644
--- a/mlir/test/Dialect/GPU/int-range-interface.mlir
+++ b/mlir/test/Dialect/GPU/int-range-interface.mlir
@@ -138,8 +138,8 @@ module attributes {gpu.container_module} {
 module attributes {gpu.container_module} {
   gpu.module @gpu_module {
     gpu.func @annotated_kernel() kernel
-      attributes {gpu.known_block_size = array<i32: 8, 12, 16>,
-          gpu.known_grid_size = array<i32: 20, 24, 28>} {
+      attributes {known_block_size = array<i32: 8, 12, 16>,
+          known_grid_size = array<i32: 20, 24, 28>} {
 
       %grid_dim_x = gpu.grid_dim x
       %grid_dim_y = gpu.grid_dim y
@@ -215,3 +215,117 @@ module attributes {gpu.container_module} {
   }
 }
 
+// -----
+
+// CHECK-LABEL: func @annotated_kernel
+module {
+  func.func @annotated_kernel()
+    attributes {gpu.known_block_size = array<i32: 8, 12, 16>,
+        gpu.known_grid_size = array<i32: 20, 24, 28>} {
+
+    %block_id_x = gpu.block_id x
+    %block_id_y = gpu.block_id y
+    %block_id_z = gpu.block_id z
+
+    // CHECK: test.reflect_bounds {smax = 19 : index, smin = 0 : index, umax = 19 : index, umin = 0 : index}
+    // CHECK: test.reflect_bounds {smax = 23 : index, smin = 0 : index, umax = 23 : index, umin = 0 : index}
+    // CHECK: test.reflect_bounds {smax = 27 : index, smin = 0 : index, umax = 27 : index, umin = 0 : index}
+    %block_id_x0 = test.reflect_bounds %block_id_x : index
+    %block_id_y0 = test.reflect_bounds %block_id_y : index
+    %block_id_z0 = test.reflect_bounds %block_id_z : index
+
+    %thread_id_x = gpu.thread_id x
+    %thread_id_y = gpu.thread_id y
+    %thread_id_z = gpu.thread_id z
+
+    // CHECK: test.reflect_bounds {smax = 7 : index, smin = 0 : index, umax = 7 : index, umin = 0 : index}
+    // CHECK: test.reflect_bounds {smax = 11 : index, smin = 0 : index, umax = 11 : index, umin = 0 : index}
+    // CHECK: test.reflect_bounds {smax = 15 : index, smin = 0 : index, umax = 15 : index, umin = 0 : index}
+    %thread_id_x0 = test.reflect_bounds %thread_id_x : index
+    %thread_id_y0 = test.reflect_bounds %thread_id_y : index
+    %thread_id_z0 = test.reflect_bounds %thread_id_z : index
+
+    return
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func @local_bounds_kernel
+module attributes {gpu.container_module} {
+  gpu.module @gpu_module {
+    gpu.func @local_bounds_kernel() kernel {
+
+      %grid_dim_x = gpu.grid_dim x upper_bound 20
+      %grid_dim_y = gpu.grid_dim y upper_bound 24
+      %grid_dim_z = gpu.grid_dim z upper_bound 28
+
+      // CHECK: test.reflect_bounds {smax = 20 : index, smin = 1 : index, umax = 20 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 24 : index, smin = 1 : index, umax = 24 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 28 : index, smin = 1 : index, umax = 28 : index, umin = 1 : index}
+      %grid_dim_x0 = test.reflect_bounds %grid_dim_x : index
+      %grid_dim_y0 = test.reflect_bounds %grid_dim_y : index
+      %grid_dim_z0 = test.reflect_bounds %grid_dim_z : index
+
+      %block_id_x = gpu.block_id x upper_bound 20
+      %block_id_y = gpu.block_id y upper_bound 24
+      %block_id_z = gpu.block_id z upper_bound 28
+
+      // CHECK: test.reflect_bounds {smax = 19 : index, smin = 0 : index, umax = 19 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 23 : index, smin = 0 : index, umax = 23 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 27 : index, smin = 0 : index, umax = 27 : index, umin = 0 : index}
+      %block_id_x0 = test.reflect_bounds %block_id_x : index
+      %block_id_y0 = test.reflect_bounds %block_id_y : index
+      %block_id_z0 = test.reflect_bounds %block_id_z : index
+
+      %block_dim_x = gpu.block_dim x upper_bound 8
+      %block_dim_y = gpu.block_dim y upper_bound 12
+      %block_dim_z = gpu.block_dim z upper_bound 16
+
+      // CHECK: test.reflect_bounds {smax = 8 : index, smin = 1 : index, umax = 8 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 12 : index, smin = 1 : index, umax = 12 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 16 : index, smin = 1 : index, umax = 16 : index, umin = 1 : index}
+      %block_dim_x0 = test.reflect_bounds %block_dim_x : index
+      %block_dim_y0 = test.reflect_bounds %block_dim_y : index
+      %block_dim_z0 = test.reflect_bounds %block_dim_z : index
+
+      %thread_id_x = gpu.thread_id x upper_bound 8
+      %thread_id_y = gpu.thread_id y upper_bound 12
+      %thread_id_z = gpu.thread_id z upper_bound 16
+
+      // CHECK: test.reflect_bounds {smax = 7 : index, smin = 0 : index, umax = 7 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 11 : index, smin = 0 : index, umax = 11 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 15 : index, smin = 0 : index, umax = 15 : index, umin = 0 : index}
+      %thread_id_x0 = test.reflect_bounds %thread_id_x : index
+      %thread_id_y0 = test.reflect_bounds %thread_id_y : index
+      %thread_id_z0 = test.reflect_bounds %thread_id_z : index
+
+      %global_id_x = gpu.global_id x upper_bound 160
+      %global_id_y = gpu.global_id y upper_bound 288
+      %global_id_z = gpu.global_id z upper_bound 448
+
+      // CHECK: test.reflect_bounds {smax = 159 : index, smin = 0 : index, umax = 159 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 287 : index, smin = 0 : index, umax = 287 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 447 : index, smin = 0 : index, umax = 447 : index, umin = 0 : index}
+      %global_id_x0 = test.reflect_bounds %global_id_x : index
+      %global_id_y0 = test.reflect_bounds %global_id_y : index
+      %global_id_z0 = test.reflect_bounds %global_id_z : index
+
+      %subgroup_size = gpu.subgroup_size upper_bound 32 : index
+      %subgroup_id = gpu.subgroup_id upper_bound 32 : index
+      %num_subgroups = gpu.num_subgroups upper_bound 8 : index
+      %lane_id = gpu.lane_id upper_bound 64
+
+      // CHECK: test.reflect_bounds {smax = 32 : index, smin = 1 : index, umax = 32 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 31 : index, smin = 0 : index, umax = 31 : index, umin = 0 : index}
+      // CHECK: test.reflect_bounds {smax = 8 : index, smin = 1 : index, umax = 8 : index, umin = 1 : index}
+      // CHECK: test.reflect_bounds {smax = 63 : index, smin = 0 : index, umax = 63 : index, umin = 0 : index}
+      %subgroup_size0 = test.reflect_bounds %subgroup_size : index
+      %subgroup_id0 = test.reflect_bounds %subgroup_id : index
+      %num_subgroups0 = test.reflect_bounds %num_subgroups : index
+      %lane_id0 = test.reflect_bounds %lane_id : index
+
+      gpu.return
+    }
+  }
+}

diff  --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 273bc282b0b3b..e9d8f329be8ed 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -704,11 +704,9 @@ func.func @alloc() {
 // -----
 
 module attributes {gpu.container_module} {
-  gpu.module @kernel {
-    // expected-error at +1 {{'gpu.func' op gpu.known_block_size must be a dense i32 array}}
-    gpu.func @kernel() kernel attributes {gpu.known_block_size = 32 : i32} {
-      gpu.return
-    }
+  // expected-error at +1 {{'func.func' op gpu.known_block_size must be a dense i32 array}}
+  func.func @kernel() attributes {gpu.known_block_size = 32 : i32} {
+    func.return
   }
 }
 
@@ -716,8 +714,8 @@ module attributes {gpu.container_module} {
 
 module attributes {gpu.container_module} {
   gpu.module @kernel {
-    // expected-error at +1 {{'gpu.func' op gpu.known_block_size must contain exactly 3 elements}}
-    gpu.func @kernel() kernel attributes {gpu.known_block_size = array<i32: 2, 1>} {
+    // expected-error at +1 {{'gpu.func' op attribute 'known_block_size' failed to satisfy constraint: i32 dense array attribute with 3 elements (if present)}}
+    gpu.func @kernel() kernel attributes {known_block_size = array<i32: 2, 1>} {
       gpu.return
     }
   }
@@ -725,6 +723,15 @@ module attributes {gpu.container_module} {
 
 // -----
 
+module {
+  // expected-error at +1 {{'func.func' op gpu.known_block_size must contain exactly 3 elements}}
+  func.func @kernel() attributes {gpu.known_block_size = array<i32: 2, 1>} {
+    func.return
+  }
+}
+
+// -----
+
 module {
   // expected-error @+1 {{'gpu.module' op attribute 'targets' failed to satisfy constraint: array of GPU target attributes with at least 1 elements}}
   gpu.module @gpu_funcs [] {

diff  --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 47ebe326b5d12..7f44f11b47e06 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -40,8 +40,8 @@ func.func @launch() {
 // CHECK-LABEL: gpu.module @launch_kernel
 // CHECK-NEXT: gpu.func @launch_kernel
 // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
-// CHECK-SAME: gpu.known_block_size = array<i32: 20, 24, 28>
-// CHECK-SAME: gpu.known_grid_size = array<i32: 8, 12, 16>
+// CHECK-SAME: known_block_size = array<i32: 20, 24, 28>
+// CHECK-SAME: known_grid_size = array<i32: 8, 12, 16>
 // CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
 // CHECK-NEXT: = gpu.block_id y
 // CHECK-NEXT: = gpu.block_id z
@@ -108,8 +108,8 @@ llvm.func @launch_from_llvm_func() {
   // CHECK-NEXT: llvm.return
 
   // CHECK: gpu.func {{.*}} kernel attributes
-  // CHECK-SAME: gpu.known_block_size = array<i32: 1, 1, 1>
-  // CHECK-SAME: gpu.known_grid_size = array<i32: 1, 1, 1>
+  // CHECK-SAME: known_block_size = array<i32: 1, 1, 1>
+  // CHECK-SAME: known_grid_size = array<i32: 1, 1, 1>
   // CHECK: gpu.return
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %dim, %grid_y = %dim,
                                        %grid_z = %dim)
@@ -362,8 +362,8 @@ func.func @recursive_device_function() {
 
 // CHECK-LABEL: @non_constant_launches
 func.func @non_constant_launches(%arg0 : index) {
-  // CHECK-NOT: gpu.known_block_size
-  // CHECK-NOT: gpu.known_grid_size
+  // CHECK-NOT: known_block_size
+  // CHECK-NOT: known_grid_size
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %arg0, %grid_y = %arg0,
                                        %grid_z = %arg0)
              threads(%tx, %ty, %tz) in (%block_x = %arg0, %block_y = %arg0,
@@ -484,8 +484,8 @@ func.func @launch_cluster() {
 // CHECK-LABEL: gpu.module @launch_cluster_kernel
 // CHECK-NEXT: gpu.func @launch_cluster_kernel
 // CHECK-SAME: (%[[KERNEL_ARG0:.*]]: f32, %[[KERNEL_ARG1:.*]]: memref<?xf32, 1>)
-// CHECK-SAME: gpu.known_block_size = array<i32: 20, 24, 28>
-// CHECK-SAME: gpu.known_grid_size = array<i32: 8, 12, 16>
+// CHECK-SAME: known_block_size = array<i32: 20, 24, 28>
+// CHECK-SAME: known_grid_size = array<i32: 8, 12, 16>
 // CHECK-NEXT: %[[BID:.*]] = gpu.block_id x
 // CHECK-NEXT: = gpu.block_id y
 // CHECK-NEXT: = gpu.block_id z


        

