[Mlir-commits] [mlir] 803828f - [mlir][GPU] Refactor, improve constant size information handling (#186907)

Tue Mar 17 12:03:54 PDT 2026

Author: Krzysztof Drewniak
Date: 2026-03-17T12:03:49-07:00
New Revision: 803828f4aa6285bc9c7f27f7d33de5713085ff3b

URL: https://github.com/llvm/llvm-project/commit/803828f4aa6285bc9c7f27f7d33de5713085ff3b
DIFF: https://github.com/llvm/llvm-project/commit/803828f4aa6285bc9c7f27f7d33de5713085ff3b.diff

LOG: [mlir][GPU] Refactor, improve constant size information handling (#186907)

1. There was duplicate code between the integer range analysis's
handling of static dimension size information (ex. gpu.known_block_dim
attributes) and the handling during the lowering of those operations.
The code from integer range analysis was given a dialect-wide entry
point (and had its types fixed to be more accurate), which the lowering
templates now call.
2. The templated lowering for block/grid/cluster_dim now produces
precise ranges (indicating the constant value) where one is known, and
the lowerings in rocdl (including those for subgroup_id) have been fixed
appropriately.
3. While I was here, the gpu.dimension enum has been moved to GPUBase so
it lives next to the other enums.
4. The pattern that expands subgroup_id operations now adds any thread
dimension bounds it finds in context.

(Claude was used for an initial round of review, I did the main coding
myself.)

---------

Co-authored-by: Claude Opus 4.6 <noreply at anthropic.com>

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
    mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
    mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
    mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
    mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
    mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
    mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
    mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp
    mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
    mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
    mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
    mlir/test/Dialect/GPU/subgroupId-rewrite.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index 1a07d506b56d7..55326f044147b 100644

--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -101,6 +101,27 @@ def GPU_AddressSpaceAttr :
 
 def GPU_AddressSpaceAttrArray : TypedArrayAttrBase<GPU_AddressSpaceAttr, "GPU Address Space array">;
 
+def GPU_Dimension : GPU_I32Enum<"Dimension",
+    "a dimension, either 'x', 'y', or 'z'",
+    [
+      I32EnumAttrCase<"x", 0>,
+      I32EnumAttrCase<"y", 1>,
+      I32EnumAttrCase<"z", 2>
+    ]>;
+
+def GPU_DimensionAttr : EnumAttr<GPU_Dialect, GPU_Dimension, "dim">;
+
+def GPU_DimensionKind : I32Enum<"DimensionKind",
+  "the possible kinds of launch dimension",
+  [
+    I32EnumCase<"Other", 0, "other">,
+    I32EnumCase<"Block", 1, "block">,
+    I32EnumCase<"Grid", 2, "grid">,
+    I32EnumCase<"Cluster", 3, "cluster">
+  ]> {
+  let cppNamespace = "::mlir::gpu";
+}
+
 //===----------------------------------------------------------------------===//
 // GPU Types.
 //===----------------------------------------------------------------------===//

diff  --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
index a9886d1f21ca0..6e9cf709c7585 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -221,4 +221,12 @@ class SparseSpGEMMOpHandleType
 #define GET_OP_CLASSES
 #include "mlir/Dialect/GPU/IR/GPUOps.h.inc"
 
+namespace mlir::gpu {
+/// Retrieve the constant bounds for a given dimension and dimension kind
+/// from the context surrounding `op`, if known, and return them. This will
+/// check the bounds on an enclosing `gpu.launch`, an enclosing `gpu.func`, and
+/// any `gpu.known_*_size` on other function-like operations, in that order.
+std::optional<uint32_t>
+getKnownDimensionSizeAround(Operation *op, DimensionKind kind, Dimension dim);
+} // namespace mlir::gpu
 #endif // MLIR_DIALECT_GPU_IR_GPUDIALECT_H

diff  --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index b5a9e3413ddfd..36e0875f53b0a 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -38,18 +38,6 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
 class GPU_Op<string mnemonic, list<Trait> traits = []> :
     Op<GPU_Dialect, mnemonic, traits>;
 
-def GPU_Dimension : I32EnumAttr<"Dimension",
-    "a dimension, either 'x', 'y', or 'z'",
-    [
-      I32EnumAttrCase<"x", 0>,
-      I32EnumAttrCase<"y", 1>,
-      I32EnumAttrCase<"z", 2>
-    ]>{
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::mlir::gpu";
-}
-def GPU_DimensionAttr : EnumAttr<GPU_Dialect, GPU_Dimension, "dim">;
-
 class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
     GPU_Op<mnemonic, !listconcat(traits, [
         Pure,

diff  --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
index ad3ae74d0c683..c27236415abb1 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.cpp
@@ -22,53 +22,17 @@ LLVM::ConstantRangeAttr mlir::gpu::index_lowering::getIndexOpRange(
   // 2. Inherent attributes on a surrounding gpu.func
   // 3. Discardable attributes on a surrounding function of any kind
   // The below code handles these in reverse order so that more important
-  // sources overwrite less important ones.
-  DenseI32ArrayAttr funcBounds = nullptr;
-  if (auto funcOp = op->getParentOfType<FunctionOpInterface>()) {
-    switch (indexKind) {
-    case IndexKind::Block: {
-      auto blockHelper =
-          gpu::GPUDialect::KnownBlockSizeAttrHelper(op->getContext());
-      if (blockHelper.isAttrPresent(funcOp))
-        funcBounds = blockHelper.getAttr(funcOp);
-      break;
-    }
-    case IndexKind::Grid: {
-      auto gridHelper =
-          gpu::GPUDialect::KnownGridSizeAttrHelper(op->getContext());
-      if (gridHelper.isAttrPresent(funcOp))
-        funcBounds = gridHelper.getAttr(funcOp);
-      break;
-    }
-    case IndexKind::Cluster: {
-      auto clusterHelper =
-          gpu::GPUDialect::KnownClusterSizeAttrHelper(op->getContext());
-      if (clusterHelper.isAttrPresent(funcOp))
-        funcBounds = clusterHelper.getAttr(funcOp);
-      break;
-    }
-    case IndexKind::Other:
-      break;
-    }
-  }
-  if (auto gpuFunc = op->getParentOfType<gpu::GPUFuncOp>()) {
-    switch (indexKind) {
-    case IndexKind::Block:
-      funcBounds = gpuFunc.getKnownBlockSizeAttr();
-      break;
-    case IndexKind::Grid:
-      funcBounds = gpuFunc.getKnownGridSizeAttr();
-      break;
-    case IndexKind::Cluster:
-      funcBounds = gpuFunc.getKnownClusterSizeAttr();
-      break;
-    case IndexKind::Other:
-      break;
-    }
-  }
-  std::optional<uint32_t> upperBound;
-  if (funcBounds)
-    upperBound = funcBounds.asArrayRef()[static_cast<uint32_t>(dim)];
+  // sources overwrite less important ones. As an exception, dimension-size
+  // getters will return exact bounds if known.
+  std::optional<uint32_t> upperBound =
+      getKnownDimensionSizeAround(op, indexKind, dim);
+  // If our upper bound is the maximum possible value, we can't easily construct
+  // the constant range for it.
+  if (upperBound && intrType == IntrType::Dim &&
+      *upperBound < std::numeric_limits<uint32_t>::max())
+    return LLVM::ConstantRangeAttr::get(op->getContext(), bitWidth, *upperBound,
+                                        *upperBound + 1);
+
   if (opUpperBound)
     upperBound = *opUpperBound;
 

diff  --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
index 186823e1b40ca..777f3b92b941c 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -16,7 +16,9 @@
 namespace mlir {
 namespace gpu {
 namespace index_lowering {
-enum class IndexKind : uint32_t { Other = 0, Block = 1, Grid = 2, Cluster = 3 };
+// Alias so existing call sites don't need updating.
+using IndexKind = gpu::DimensionKind;
+
 enum class IntrType : uint32_t {
   None = 0,
   Id = 1,
@@ -27,7 +29,9 @@ enum class IntrType : uint32_t {
 /// are found. `bitWidth` controls the width of the returned range.
 /// Checks the provided upper_bound from the op (highest priority), inherent
 /// attrs on enclosing `gpu.func`s, and discardable attributes on other
-/// enclosing function ops (lowest priority).
+/// enclosing function ops (lowest priority). However, in the case where
+/// a dimension is known to have a constant value, returns a range indicating
+/// that value.
 LLVM::ConstantRangeAttr getIndexOpRange(Operation *op, gpu::Dimension dim,
                                         std::optional<uint32_t> opUpperBound,
                                         IndexKind indexKind, IntrType intrType,

diff  --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index b1a4627977f80..e08ec138c853a 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -106,19 +106,24 @@ static Value getLaneId(RewriterBase &rewriter, Location loc) {
 static constexpr int64_t kMaxThreadsPerBlockDim = 1024;
 
 /// Emits a call to an OCKL block/grid size function corresponding to
-/// `indexKind` with argument `dim`, querying for upper bounds in the context
-/// surrounding `contextOp` as a fallback for an unknown/unavailable
-/// `opUpperBound`.
-static Value getOcklDim(RewriterBase &rewriter,
-                        gpu::index_lowering::IndexKind indexKind,
-                        gpu::Dimension dim, Operation *contextOp,
-                        std::optional<uint32_t> opUpperBound) {
+/// `indexKind` with argument `dim`, except that if the context around
+/// `contextOp` gives an exact size for that dimension, return that as
+/// an `i64` constant instead.
+static Value getKnownOrOcklDim(RewriterBase &rewriter,
+                               gpu::index_lowering::IndexKind indexKind,
+                               gpu::Dimension dim, Operation *contextOp,
+                               std::optional<uint32_t> opUpperBound) {
   Location loc = contextOp->getLoc();
   MLIRContext *context = contextOp->getContext();
 
   auto i32Ty = IntegerType::get(context, 32);
   auto i64Ty = IntegerType::get(context, 64);
 
+  if (std::optional<uint32_t> knownDim =
+          gpu::getKnownDimensionSizeAround(contextOp, indexKind, dim))
+    return LLVM::ConstantOp::create(rewriter, loc,
+                                    rewriter.getI64IntegerAttr(*knownDim));
+
   int32_t dimParam = static_cast<int32_t>(dim);
 
   StringRef functionName;
@@ -145,14 +150,16 @@ static Value getOcklDim(RewriterBase &rewriter,
   auto callOp =
       LLVM::CallOp::create(rewriter, loc, funcOp, ValueRange{dimConst});
 
-  // Set range attribute on the call result if bounds are available.
-  auto range = gpu::index_lowering::getIndexOpRange(
-      contextOp, dim, opUpperBound, indexKind,
-      gpu::index_lowering::IntrType::Dim, /*bitWidth=*/64);
-  // Fall back to the hardware limit for block dimensions.
-  if (!range && indexKind == gpu::index_lowering::IndexKind::Block)
+  LLVM::ConstantRangeAttr range;
+  if (opUpperBound) {
+    range = LLVM::ConstantRangeAttr::get(
+        context, APInt(64, 1),
+        APInt(64, static_cast<uint64_t>(*opUpperBound) + 1));
+  } else if (indexKind == gpu::index_lowering::IndexKind::Block) {
+    // Set the hardware limit for block ranges as the bounds on block dim calls.
     range = LLVM::ConstantRangeAttr::get(context, APInt(64, 1),
                                          APInt(64, kMaxThreadsPerBlockDim + 1));
+  }
   if (range) {
     callOp.setResAttrsAttr(rewriter.getArrayAttr(rewriter.getDictionaryAttr(
         rewriter.getNamedAttr(LLVM::LLVMDialect::getRangeAttrName(), range))));
@@ -186,8 +193,8 @@ struct GPUDimOpToOcklCall final : ConvertOpToLLVMPattern<OpTy> {
     if (auto bound = op.getUpperBound())
       opUpperBound = static_cast<uint32_t>(bound->getZExtValue());
 
-    Value ocklCall =
-        getOcklDim(rewriter, indexKind, op.getDimension(), op, opUpperBound);
+    Value ocklCall = getKnownOrOcklDim(rewriter, indexKind, op.getDimension(),
+                                       op, opUpperBound);
     Value result = truncOrExtToLLVMType(rewriter, loc, ocklCall,
                                         *this->getTypeConverter());
     rewriter.replaceOp(op, result);
@@ -305,8 +312,8 @@ struct GPUSubgroupIdOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
 
       auto getBlockDim = [&](gpu::Dimension dim) {
         Value dim64 =
-            getOcklDim(rewriter, gpu::index_lowering::IndexKind::Block, dim, op,
-                       std::nullopt);
+            getKnownOrOcklDim(rewriter, gpu::index_lowering::IndexKind::Block,
+                              dim, op, std::nullopt);
         Value dimTrunc =
             LLVM::TruncOp::create(rewriter, loc, int32Type, dim64, flags);
         return dimTrunc;

diff  --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index 263fcb96c17db..5eb8aa2fe0923 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -10,7 +10,6 @@
 #include "mlir/IR/Matchers.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Interfaces/InferIntRangeInterface.h"
-#include "llvm/Support/ErrorHandling.h"
 #include <optional>
 
 using namespace mlir;
@@ -18,7 +17,7 @@ using namespace mlir::gpu;
 
 // Maximum grid and block dimensions of all known GPUs are less than 2^32.
 static constexpr uint64_t kMaxDim = std::numeric_limits<uint32_t>::max();
-// Maximum cluster size
+// Maximum cluster size.
 static constexpr uint64_t kMaxClusterDim = 8;
 // Maximum subgroups are no larger than 128.
 static constexpr uint64_t kMaxSubgroupSize = 128;
@@ -29,15 +28,7 @@ static ConstantIntRanges getIndexRange(uint64_t umin, uint64_t umax) {
                                          APInt(width, umax));
 }
 
-namespace {
-enum class LaunchDims : uint32_t { Block = 0, Grid = 1, Cluster = 2 };
-} // end namespace
-
-/// If the operation `op` is in a context that is annotated with maximum
-/// launch dimensions (a launch op with constant block or grid
-/// sizes or a launch_func op with the appropriate dimensions), return
-/// the bound on the maximum size of the dimension that the op is querying.
-/// IDs will be one less than this bound.
+static uint64_t zext(uint32_t arg) { return static_cast<uint64_t>(arg); }
 
 static Value valueByDim(KernelDim3 dims, Dimension dim) {
   switch (dim) {
@@ -51,53 +42,55 @@ static Value valueByDim(KernelDim3 dims, Dimension dim) {
   llvm_unreachable("All dimension enum cases handled above");
 }
 
-static uint64_t zext(uint32_t arg) { return static_cast<uint64_t>(arg); }
-
-static std::optional<uint64_t>
-getKnownLaunchAttr(GPUFuncOp func, LaunchDims dims, Dimension dim) {
+static std::optional<uint32_t>
+getKnownLaunchAttr(GPUFuncOp func, DimensionKind dims, Dimension dim) {
   DenseI32ArrayAttr bounds;
   switch (dims) {
-  case LaunchDims::Block:
+  case DimensionKind::Other:
+    return std::nullopt;
+  case DimensionKind::Block:
     bounds = func.getKnownBlockSizeAttr();
     break;
-  case LaunchDims::Grid:
+  case DimensionKind::Grid:
     bounds = func.getKnownGridSizeAttr();
     break;
-  case LaunchDims::Cluster:
+  case DimensionKind::Cluster:
     bounds = func.getKnownClusterSizeAttr();
     break;
   }
   if (!bounds)
     return std::nullopt;
-  if (bounds.size() < static_cast<uint32_t>(dim))
+  if (bounds.size() <= static_cast<uint32_t>(dim))
     return std::nullopt;
-  return zext(bounds[static_cast<uint32_t>(dim)]);
+  return bounds[static_cast<uint32_t>(dim)];
 }
 
-static std::optional<uint64_t> getKnownLaunchAttr(FunctionOpInterface func,
+static std::optional<uint32_t> getKnownLaunchAttr(FunctionOpInterface func,
                                                   StringRef attrName,
                                                   Dimension dim) {
   auto bounds = func.getOperation()->getAttrOfType<DenseI32ArrayAttr>(attrName);
   if (!bounds)
     return std::nullopt;
-  if (bounds.size() < static_cast<uint32_t>(dim))
+  if (bounds.size() <= static_cast<uint32_t>(dim))
     return std::nullopt;
-  return zext(bounds[static_cast<uint32_t>(dim)]);
+  return bounds[static_cast<uint32_t>(dim)];
 }
 
-template <typename Op>
-static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
-  Dimension dim = op.getDimension();
-  if (auto launch = op->template getParentOfType<LaunchOp>()) {
+std::optional<uint32_t>
+mlir::gpu::getKnownDimensionSizeAround(Operation *op, DimensionKind kind,
+                                       Dimension dim) {
+  if (auto launch = op->getParentOfType<LaunchOp>()) {
     KernelDim3 bounds;
-    switch (type) {
-    case LaunchDims::Block:
+    switch (kind) {
+    case DimensionKind::Other:
+      return std::nullopt;
+    case DimensionKind::Block:
       bounds = launch.getBlockSizeOperandValues();
       break;
-    case LaunchDims::Grid:
+    case DimensionKind::Grid:
       bounds = launch.getGridSizeOperandValues();
       break;
-    case LaunchDims::Cluster:
+    case DimensionKind::Cluster:
       if (launch.hasClusterSize()) {
         auto clusterBounds = launch.getClusterSizeOperandValues();
         if (clusterBounds)
@@ -107,25 +100,27 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
     }
     Value maybeBound = valueByDim(bounds, dim);
     APInt value;
-    if (matchPattern(maybeBound, m_ConstantInt(&value)))
+    if (maybeBound && matchPattern(maybeBound, m_ConstantInt(&value)))
       return value.getZExtValue();
   }
 
-  if (auto gpuFunc = op->template getParentOfType<GPUFuncOp>()) {
-    auto inherentAttr = getKnownLaunchAttr(gpuFunc, type, dim);
+  if (auto gpuFunc = op->getParentOfType<GPUFuncOp>()) {
+    auto inherentAttr = getKnownLaunchAttr(gpuFunc, kind, dim);
     if (inherentAttr)
       return inherentAttr;
   }
-  if (auto func = op->template getParentOfType<FunctionOpInterface>()) {
+  if (auto func = op->getParentOfType<FunctionOpInterface>()) {
     StringRef attrName;
-    switch (type) {
-    case LaunchDims::Block:
+    switch (kind) {
+    case DimensionKind::Other:
+      return std::nullopt;
+    case DimensionKind::Block:
       attrName = GPUDialect::KnownBlockSizeAttrHelper::getNameStr();
       break;
-    case LaunchDims::Grid:
+    case DimensionKind::Grid:
       attrName = GPUDialect::KnownGridSizeAttrHelper::getNameStr();
       break;
-    case LaunchDims::Cluster:
+    case DimensionKind::Cluster:
       attrName = GPUDialect::KnownClusterSizeAttrHelper::getNameStr();
       break;
     }
@@ -146,8 +141,10 @@ void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 
 void ClusterDimBlocksOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                            SetIntRangeFn setResultRange) {
-  if (auto known = getKnownLaunchDim(*this, LaunchDims::Cluster))
-    return setResultRange(getResult(), getIndexRange(*known, *known));
+  if (auto known = getKnownDimensionSizeAround(*this, DimensionKind::Cluster,
+                                               getDimension()))
+    return setResultRange(getResult(),
+                          getIndexRange(zext(*known), zext(*known)));
 
   uint64_t max = kMaxClusterDim;
   if (auto specified = getUpperBound())
@@ -166,8 +163,9 @@ void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 void ClusterBlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                          SetIntRangeFn setResultRange) {
   uint64_t max = kMaxClusterDim;
-  if (auto known = getKnownLaunchDim(*this, LaunchDims::Cluster))
-    max = *known;
+  if (auto known = getKnownDimensionSizeAround(*this, DimensionKind::Cluster,
+                                               getDimension()))
+    max = zext(*known);
   if (auto specified = getUpperBound())
     max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
@@ -175,11 +173,12 @@ void ClusterBlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 
 void BlockDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                    SetIntRangeFn setResultRange) {
-  std::optional<uint64_t> knownVal =
-      getKnownLaunchDim(*this, LaunchDims::Block);
+  std::optional<uint32_t> knownVal =
+      getKnownDimensionSizeAround(*this, DimensionKind::Block, getDimension());
   if (knownVal)
-    return setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
-  ;
+    return setResultRange(getResult(),
+                          getIndexRange(zext(*knownVal), zext(*knownVal)));
+
   uint64_t max = kMaxDim;
   if (auto specified = getUpperBound())
     max = specified->getZExtValue();
@@ -189,8 +188,9 @@ void BlockDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 void BlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                   SetIntRangeFn setResultRange) {
   uint64_t max = kMaxDim;
-  if (auto fromContext = getKnownLaunchDim(*this, LaunchDims::Grid))
-    max = fromContext.value();
+  if (auto fromContext = getKnownDimensionSizeAround(*this, DimensionKind::Grid,
+                                                     getDimension()))
+    max = zext(*fromContext);
   if (auto specified = getUpperBound())
     max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
@@ -198,9 +198,11 @@ void BlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 
 void GridDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                   SetIntRangeFn setResultRange) {
-  std::optional<uint64_t> knownVal = getKnownLaunchDim(*this, LaunchDims::Grid);
+  std::optional<uint32_t> knownVal =
+      getKnownDimensionSizeAround(*this, DimensionKind::Grid, getDimension());
   if (knownVal)
-    return setResultRange(getResult(), getIndexRange(*knownVal, *knownVal));
+    return setResultRange(getResult(),
+                          getIndexRange(zext(*knownVal), zext(*knownVal)));
   uint64_t max = kMaxDim;
   if (auto specified = getUpperBound())
     max = specified->getZExtValue();
@@ -210,8 +212,9 @@ void GridDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 void ThreadIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                    SetIntRangeFn setResultRange) {
   uint64_t max = kMaxDim;
-  if (auto fromContext = getKnownLaunchDim(*this, LaunchDims::Block))
-    max = fromContext.value();
+  if (auto fromContext = getKnownDimensionSizeAround(
+          *this, DimensionKind::Block, getDimension()))
+    max = zext(*fromContext);
   if (auto specified = getUpperBound())
     max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
@@ -239,10 +242,12 @@ void GlobalIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
     return setResultRange(getResult(),
                           getIndexRange(0, specified->getZExtValue() - 1ULL));
 
-  uint64_t blockDimMax =
-      getKnownLaunchDim(*this, LaunchDims::Block).value_or(kMaxDim);
-  uint64_t gridDimMax =
-      getKnownLaunchDim(*this, LaunchDims::Grid).value_or(kMaxDim);
+  uint64_t blockDimMax = zext(
+      getKnownDimensionSizeAround(*this, DimensionKind::Block, getDimension())
+          .value_or(kMaxDim));
+  uint64_t gridDimMax = zext(
+      getKnownDimensionSizeAround(*this, DimensionKind::Grid, getDimension())
+          .value_or(kMaxDim));
   setResultRange(getResult(),
                  getIndexRange(0, (blockDimMax * gridDimMax) - 1ULL));
 }

diff  --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp
index ad6ee9e070d09..50dbc584a2c01 100644
--- a/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupIdRewriter.cpp
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/Index/IR/IndexOps.h"
@@ -54,11 +55,39 @@ struct GpuSubgroupIdRewriter final : OpRewritePattern<gpu::SubgroupIdOp> {
     Location loc = op->getLoc();
     Type indexType = rewriter.getIndexType();
 
-    Value dimX = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::x);
-    Value dimY = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::y);
-    Value tidX = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::x);
-    Value tidY = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::y);
-    Value tidZ = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::z);
+    auto asMaybeIndexAttr = [&](std::optional<uint32_t> bound) -> IntegerAttr {
+      if (!bound)
+        return IntegerAttr();
+      return IntegerAttr::get(
+          indexType, static_cast<int64_t>(static_cast<uint64_t>(*bound)));
+    };
+
+    IntegerAttr maybeKnownDimX =
+        asMaybeIndexAttr(gpu::getKnownDimensionSizeAround(
+            op, gpu::DimensionKind::Block, gpu::Dimension::x));
+    IntegerAttr maybeKnownDimY =
+        asMaybeIndexAttr(gpu::getKnownDimensionSizeAround(
+            op, gpu::DimensionKind::Block, gpu::Dimension::y));
+    IntegerAttr maybeKnownDimZ =
+        asMaybeIndexAttr(gpu::getKnownDimensionSizeAround(
+            op, gpu::DimensionKind::Block, gpu::Dimension::z));
+
+    Value dimX, dimY;
+    if (maybeKnownDimX)
+      dimX = arith::ConstantOp::create(rewriter, loc, maybeKnownDimX);
+    else
+      dimX = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::x);
+    if (maybeKnownDimY)
+      dimY = arith::ConstantOp::create(rewriter, loc, maybeKnownDimY);
+    else
+      dimY = gpu::BlockDimOp::create(rewriter, loc, gpu::Dimension::y);
+
+    Value tidX = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::x,
+                                         maybeKnownDimX);
+    Value tidY = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::y,
+                                         maybeKnownDimY);
+    Value tidZ = gpu::ThreadIdOp::create(rewriter, loc, gpu::Dimension::z,
+                                         maybeKnownDimZ);
 
     Value dimYxIdZ =
         arith::MulIOp::create(rewriter, loc, indexType, dimY, tidZ);

diff  --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 4837800488e86..929794f035b9f 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1179,11 +1179,11 @@ gpu.module @test_module_cluster_block_ops {
     %1 = gpu.cluster_block_id y
     // CHECK: nvvm.read.ptx.sreg.cluster.ctaid.z range <i32, 0, 2> : i32
     %2 = gpu.cluster_block_id z
-    // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.x range <i32, 1, 9> : i32
+    // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.x range <i32, 8, 9> : i32
     %3 = gpu.cluster_dim_blocks x
-    // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.y range <i32, 1, 5> : i32
+    // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.y range <i32, 4, 5> : i32
     %4 = gpu.cluster_dim_blocks y
-    // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.z range <i32, 1, 3> : i32
+    // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.z range <i32, 2, 3> : i32
     %5 = gpu.cluster_dim_blocks z
 
     %6 = arith.addi %0, %1 : index

diff  --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
index b44216aab3bff..9cab3ff48f5bf 100644
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-subgroup-id.mlir
@@ -50,12 +50,9 @@ func.func @subgroup_id_with_workgroup_sizes() -> index
   // GFX9-DAG: %[[IDX:.+]] = rocdl.workitem.id.x range <i32, 0, 64> : i32
   // GFX9-DAG: %[[IDY:.+]] = rocdl.workitem.id.y range <i32, 0, 4> : i32
   // GFX9-DAG: %[[IDZ:.+]] = rocdl.workitem.id.z range <i32, 0, 1> : i32
-  // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C0:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 65>})
-  // Yes, this is checking after the call that uses it. This prevents collisions with other 0s.
-  // GFX9-DAG: %[[C0]] = llvm.mlir.constant(0 : i32) : i32
+  // GFX9-DAG: %[[DIMX_I64:.+]] = llvm.mlir.constant(64 : i64) : i64
   // GFX9-DAG: %[[DIMX:.+]] = llvm.trunc %[[DIMX_I64]] overflow<nsw, nuw> : i64 to i32
-  // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.call @__ockl_get_local_size(%[[C1:.+]]) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 5>})
-  // GFX9-DAG: %[[C1]] = llvm.mlir.constant(1 : i32) : i32
+  // GFX9-DAG: %[[DIMY_I64:.+]] = llvm.mlir.constant(4 : i64) : i64
   // GFX9-DAG: %[[DIMY:.+]] = llvm.trunc %[[DIMY_I64]] overflow<nsw, nuw> : i64 to i32
   // GFX9: %[[Z_DY:.+]] = llvm.mul %[[DIMY]], %[[IDZ]] overflow<nsw, nuw>
   // GFX9: %[[ZY:.+]] = llvm.add %[[IDY]], %[[Z_DY]] overflow<nsw, nuw>

diff  --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index 3cc9ded6fe916..5eaa2d0b4df28 100755
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -100,18 +100,18 @@ gpu.module @test_module {
     // CHECK: rocdl.workgroup.id.z range <i32, 0, 28> : i32
     %bIdZ = gpu.block_id z
 
-    // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 9>})
+    // CHECK: llvm.mlir.constant(8 : i64) : i64
     %bDimX = gpu.block_dim x
-    // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 13>})
+    // CHECK: llvm.mlir.constant(12 : i64) : i64
     %bDimY = gpu.block_dim y
-    // CHECK: llvm.call @__ockl_get_local_size(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 17>})
+    // CHECK: llvm.mlir.constant(16 : i64) : i64
     %bDimZ = gpu.block_dim z
 
-    // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 21>})
+    // CHECK: llvm.mlir.constant(20 : i64) : i64
     %gDimX = gpu.grid_dim x
-    // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 25>})
+    // CHECK: llvm.mlir.constant(24 : i64) : i64
     %gDimY = gpu.grid_dim y
-    // CHECK: llvm.call @__ockl_get_num_groups(%{{.*}}) : (i32) -> (i64 {llvm.range = #llvm.constant_range<i64, 1, 29>})
+    // CHECK: llvm.mlir.constant(28 : i64) : i64
     %gDimZ = gpu.grid_dim z
 
     // "Usage" to make the ID calls not die

diff  --git a/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir
index 0d4f4d590bb4e..23cc31baf73b1 100644
--- a/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir
+++ b/mlir/test/Dialect/GPU/subgroupId-rewrite.mlir
@@ -22,3 +22,29 @@ func.func @subgroupId(%sz : index, %mem: memref<index, 1>) {
   }
   return
 }
+
+// CHECK-LABEL: func.func @subgroupIdConsts
+// CHECK-SAME: (%[[SZ:.*]]: index, %[[MEM:.*]]: memref<index, 1>) {
+func.func @subgroupIdConsts(%sz : index, %mem: memref<index, 1>) {
+  %c32 = arith.constant 32 : index
+  %c4 = arith.constant 4 : index
+  %c2 = arith.constant 2 : index
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
+             threads(%tx, %ty, %tz) in (%block_x = %c32, %block_y = %c4, %block_z = %c2) {
+    // CHECK-DAG: %[[DIMX:.*]] = arith.constant 32 : index
+    // CHECK-DAG: %[[DIMY:.*]] = arith.constant 4 : index
+    // CHECK: %[[TIDX:.*]] = gpu.thread_id x upper_bound 32
+    // CHECK-NEXT: %[[TIDY:.*]] = gpu.thread_id y upper_bound 4
+    // CHECK-NEXT: %[[TIDZ:.*]] = gpu.thread_id z upper_bound 2
+    // CHECK-NEXT: %[[T0:.*]] = arith.muli %[[TIDZ]], %[[DIMY]] : index
+    // CHECK-NEXT: %[[T1:.*]] = arith.addi %[[T0]], %[[TIDY]] : index
+    // CHECK-NEXT: %[[T2:.*]] = arith.muli  %[[T1]], %[[DIMX]] : index
+    // CHECK-NEXT: %[[T3:.*]] = arith.addi %[[TIDX]], %[[T2]] : index
+    // CHECK-NEXT: %[[T4:.*]] = gpu.subgroup_size : index
+    // CHECK-NEXT: %[[T5:.*]] = arith.divui %[[T3]], %[[T4]] : index
+    %idz = gpu.subgroup_id : index
+    memref.store %idz, %mem[] : memref<index, 1>
+    gpu.terminator
+  }
+  return
+}