[Mlir-commits] [mlir] [MLIR] Propagate known cluster sizes from gpu.launch to gpu.func (PR #174404)

Mon Jan 5 05:47:10 PST 2026

https://github.com/apaszke updated https://github.com/llvm/llvm-project/pull/174404

>From 4a77a2ad0c78b8f034f39c9e63c8d0c3d6003684 Mon Sep 17 00:00:00 2001
From: Adam Paszke <adam.paszke at gmail.com>
Date: Mon, 5 Jan 2026 13:31:42 +0000
Subject: [PATCH] Propagate known cluster sizes from gpu.launch to gpu.func

This lets us properly annotate ranges for gpu.cluster_block_id and gpu.cluster_dim_blocks.
It also allows us to fill in the nvvm.cluster_dim attribute for use in the NVVM backend.
---
 mlir/include/mlir/Dialect/GPU/IR/GPUBase.td   |  4 +-
 mlir/include/mlir/Dialect/GPU/IR/GPUOps.td    |  3 +-
 .../Conversion/GPUCommon/GPUOpsLowering.cpp   | 12 +++++-
 .../lib/Conversion/GPUCommon/GPUOpsLowering.h |  7 ++++
 .../GPUCommon/IndexIntrinsicsOpLowering.h     | 12 +++++-
 .../Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp  |  3 +-
 .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp        |  8 ++--
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      |  3 +-
 mlir/lib/Dialect/GPU/IR/GPUDialect.cpp        |  2 +
 .../GPU/IR/InferIntRangeInterfaceImpls.cpp    | 20 ++++++++-
 .../Conversion/GPUToNVVM/gpu-to-nvvm.mlir     | 42 +++++++++++++++++++
 .../GPU/int-range-interface-cluster.mlir      | 27 ++++++++++++
 12 files changed, 132 insertions(+), 11 deletions(-)
 create mode 100644 mlir/test/Dialect/GPU/int-range-interface-cluster.mlir

diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
index 2c29bb8a01a41..f0086158fb9b6 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -61,10 +61,10 @@ def GPU_Dialect : Dialect {
     /// attribute with value 'workgroup`.
     static bool isWorkgroupMemoryAddressSpace(Attribute memorySpace);
   }];
-
   let discardableAttrs = (ins
     "::mlir::DenseI32ArrayAttr":$known_block_size,
-    "::mlir::DenseI32ArrayAttr":$known_grid_size
+    "::mlir::DenseI32ArrayAttr":$known_grid_size,
+    "::mlir::DenseI32ArrayAttr":$known_cluster_size
   );
 
   let dependentDialects = ["arith::ArithDialect"];
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 4884541a60535..e8c23200547d6 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -431,7 +431,8 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
                        OptionalAttr<DictArrayAttr>:$workgroup_attrib_attrs,
                        OptionalAttr<DictArrayAttr>:$private_attrib_attrs,
                        GPU_OptionalDimSizeHintAttr:$known_block_size,
-                       GPU_OptionalDimSizeHintAttr:$known_grid_size);
+                       GPU_OptionalDimSizeHintAttr:$known_grid_size,
+                       GPU_OptionalDimSizeHintAttr:$known_cluster_size);
   let regions = (region AnyRegion:$body);
 
   let skipDefaultBuilders = 1;
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index eb662a1b056de..498bea0fd17b4 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -186,7 +186,8 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
         attr.getName() == gpuFuncOp.getWorkgroupAttribAttrsAttrName() ||
         attr.getName() == gpuFuncOp.getPrivateAttribAttrsAttrName() ||
         attr.getName() == gpuFuncOp.getKnownBlockSizeAttrName() ||
-        attr.getName() == gpuFuncOp.getKnownGridSizeAttrName())
+        attr.getName() == gpuFuncOp.getKnownGridSizeAttrName() ||
+        attr.getName() == gpuFuncOp.getKnownClusterSizeAttrName())
       continue;
     if (attr.getName() == gpuFuncOp.getArgAttrsAttrName()) {
       argAttrs = gpuFuncOp.getArgAttrsAttr();
@@ -197,6 +198,7 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
 
   DenseI32ArrayAttr knownBlockSize = gpuFuncOp.getKnownBlockSizeAttr();
   DenseI32ArrayAttr knownGridSize = gpuFuncOp.getKnownGridSizeAttr();
+  DenseI32ArrayAttr knownClusterSize = gpuFuncOp.getKnownClusterSizeAttr();
   // Ensure we don't lose information if the function is lowered before its
   // surrounding context.
   auto *gpuDialect = cast<gpu::GPUDialect>(gpuFuncOp->getDialect());
@@ -206,6 +208,10 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
   if (knownGridSize)
     attributes.emplace_back(gpuDialect->getKnownGridSizeAttrHelper().getName(),
                             knownGridSize);
+  if (knownClusterSize)
+    attributes.emplace_back(
+        gpuDialect->getKnownClusterSizeAttrHelper().getName(),
+        knownClusterSize);
 
   // Add a dialect specific kernel attribute in addition to GPU kernel
   // attribute. The former is necessary for further translation while the
@@ -217,6 +223,10 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
     if (kernelBlockSizeAttributeName && knownBlockSize) {
       attributes.emplace_back(kernelBlockSizeAttributeName, knownBlockSize);
     }
+    // Set the dialect-specific cluster size attribute if there is one.
+    if (kernelClusterSizeAttributeName && knownClusterSize) {
+      attributes.emplace_back(kernelClusterSizeAttributeName, knownClusterSize);
+    }
   }
   LLVM::CConv callingConvention = gpuFuncOp.isKernel()
                                       ? kernelCallingConvention
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index ec74787b2a8ed..a3b2e04c35313 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -73,6 +73,9 @@ struct GPUFuncOpLoweringOptions {
   /// The attribute name to to set block size. Null if no attribute should be
   /// used.
   StringAttr kernelBlockSizeAttributeName;
+  /// The attribute name to set cluster size. Null if no attribute should be
+  /// used.
+  StringAttr kernelClusterSizeAttributeName;
 
   /// The calling convention to use for kernel functions.
   LLVM::CConv kernelCallingConvention = LLVM::CConv::C;
@@ -93,6 +96,7 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
         workgroupAddrSpace(options.workgroupAddrSpace),
         kernelAttributeName(options.kernelAttributeName),
         kernelBlockSizeAttributeName(options.kernelBlockSizeAttributeName),
+        kernelClusterSizeAttributeName(options.kernelClusterSizeAttributeName),
         kernelCallingConvention(options.kernelCallingConvention),
         nonKernelCallingConvention(options.nonKernelCallingConvention),
         encodeWorkgroupAttributionsAsArguments(
@@ -114,6 +118,9 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
   /// The attribute name to to set block size. Null if no attribute should be
   /// used.
   StringAttr kernelBlockSizeAttributeName;
+  /// The attribute name to set cluster size. Null if no attribute should be
+  /// used.
+  StringAttr kernelClusterSizeAttributeName;
 
   /// The calling convention to use for kernel functions
   LLVM::CConv kernelCallingConvention;
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
index 91c43e8bd1117..ae0239132e7d0 100644
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -17,7 +17,7 @@
 namespace mlir {
 namespace gpu {
 namespace index_lowering {
-enum class IndexKind : uint32_t { Other = 0, Block = 1, Grid = 2 };
+enum class IndexKind : uint32_t { Other = 0, Block = 1, Grid = 2, Cluster = 3 };
 enum class IntrType : uint32_t {
   None = 0,
   Id = 1,
@@ -92,6 +92,13 @@ struct OpLowering : public ConvertOpToLLVMPattern<Op> {
           funcBounds = gridHelper.getAttr(funcOp);
         break;
       }
+      case IndexKind::Cluster: {
+        auto clusterHelper =
+            gpu::GPUDialect::KnownClusterSizeAttrHelper(op.getContext());
+        if (clusterHelper.isAttrPresent(funcOp))
+          funcBounds = clusterHelper.getAttr(funcOp);
+        break;
+      }
       case IndexKind::Other:
         break;
       }
@@ -104,6 +111,9 @@ struct OpLowering : public ConvertOpToLLVMPattern<Op> {
       case IndexKind::Grid:
         funcBounds = gpuFunc.getKnownGridSizeAttr();
         break;
+      case IndexKind::Cluster:
+        funcBounds = gpuFunc.getKnownClusterSizeAttr();
+        break;
       case IndexKind::Other:
         break;
       }
diff --git a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
index c0480a1dfb512..01b34337647ec 100644
--- a/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
+++ b/mlir/lib/Conversion/GPUToLLVMSPV/GPUToLLVMSPV.cpp
@@ -533,7 +533,8 @@ void populateGpuToLLVMSPVConversionPatterns(
       GPUFuncOpLoweringOptions{
           privateAddressSpace, localAddressSpace,
           /*kernelAttributeName=*/{}, kernelBlockSizeAttributeName,
-          LLVM::CConv::SPIR_KERNEL, LLVM::CConv::SPIR_FUNC,
+          /*kernelClusterSizeAttributeName=*/{}, LLVM::CConv::SPIR_KERNEL,
+          LLVM::CConv::SPIR_FUNC,
           /*encodeWorkgroupAttributionsAsArguments=*/true});
 }
 
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 2561ca00d4b4f..6394296e99b9e 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -708,11 +708,11 @@ void mlir::populateGpuToNVVMConversionPatterns(
   patterns.add<gpu::index_lowering::OpLowering<
       gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
       NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>>(
-      converter, IndexKind::Other, IntrType::Id, benefit);
+      converter, IndexKind::Cluster, IntrType::Id, benefit);
   patterns.add<gpu::index_lowering::OpLowering<
       gpu::ClusterDimBlocksOp, NVVM::ClusterDimBlocksXOp,
       NVVM::ClusterDimBlocksYOp, NVVM::ClusterDimBlocksZOp>>(
-      converter, IndexKind::Other, IntrType::Dim, benefit);
+      converter, IndexKind::Cluster, IntrType::Dim, benefit);
   patterns.add<gpu::index_lowering::OpLowering<
       gpu::BlockIdOp, NVVM::BlockIdXOp, NVVM::BlockIdYOp, NVVM::BlockIdZOp>>(
       converter, IndexKind::Grid, IntrType::Id, benefit);
@@ -737,7 +737,9 @@ void mlir::populateGpuToNVVMConversionPatterns(
           StringAttr::get(&converter.getContext(),
                           NVVM::NVVMDialect::getKernelFuncAttrName()),
           StringAttr::get(&converter.getContext(),
-                          NVVM::NVVMDialect::getMaxntidAttrName())},
+                          NVVM::NVVMDialect::getMaxntidAttrName()),
+          StringAttr::get(&converter.getContext(),
+                          NVVM::NVVMDialect::getClusterDimAttrName())},
       benefit);
 
   populateLibDeviceConversionPatterns(converter, patterns, benefit);
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 51741414d2060..b8eb6d7facc6d 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -458,7 +458,8 @@ void mlir::populateGpuToROCDLConversionPatterns(
           /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
           /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
           rocdlDialect->getKernelAttrHelper().getName(),
-          rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
+          rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName(),
+          /*kernelClusterSizeAttributeName=*/{}});
   if (Runtime::HIP == runtime) {
     patterns.add<GPUPrintfOpToHIPLowering>(converter);
   } else if (Runtime::OpenCL == runtime) {
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 21c0d369b8d1c..36db6e82baaea 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -396,6 +396,8 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
     return verifyKnownLaunchSizeAttr(op, attr);
   if (attr.getName() == getKnownGridSizeAttrHelper().getName())
     return verifyKnownLaunchSizeAttr(op, attr);
+  if (attr.getName() == getKnownClusterSizeAttrHelper().getName())
+    return verifyKnownLaunchSizeAttr(op, attr);
   if (!llvm::isa<UnitAttr>(attr.getValue()) ||
       attr.getName() != getContainerModuleAttrName())
     return success();
diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
index bee3f392c91b5..263fcb96c17db 100644
--- a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
+++ b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -30,7 +30,7 @@ static ConstantIntRanges getIndexRange(uint64_t umin, uint64_t umax) {
 }
 
 namespace {
-enum class LaunchDims : uint32_t { Block = 0, Grid = 1 };
+enum class LaunchDims : uint32_t { Block = 0, Grid = 1, Cluster = 2 };
 } // end namespace
 
 /// If the operation `op` is in a context that is annotated with maximum
@@ -63,6 +63,9 @@ getKnownLaunchAttr(GPUFuncOp func, LaunchDims dims, Dimension dim) {
   case LaunchDims::Grid:
     bounds = func.getKnownGridSizeAttr();
     break;
+  case LaunchDims::Cluster:
+    bounds = func.getKnownClusterSizeAttr();
+    break;
   }
   if (!bounds)
     return std::nullopt;
@@ -94,6 +97,13 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
     case LaunchDims::Grid:
       bounds = launch.getGridSizeOperandValues();
       break;
+    case LaunchDims::Cluster:
+      if (launch.hasClusterSize()) {
+        auto clusterBounds = launch.getClusterSizeOperandValues();
+        if (clusterBounds)
+          bounds = *clusterBounds;
+      }
+      break;
     }
     Value maybeBound = valueByDim(bounds, dim);
     APInt value;
@@ -115,6 +125,9 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
     case LaunchDims::Grid:
       attrName = GPUDialect::KnownGridSizeAttrHelper::getNameStr();
       break;
+    case LaunchDims::Cluster:
+      attrName = GPUDialect::KnownClusterSizeAttrHelper::getNameStr();
+      break;
     }
     auto discardableAttr = getKnownLaunchAttr(func, attrName, dim);
     if (discardableAttr)
@@ -133,6 +146,9 @@ void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 
 void ClusterDimBlocksOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                            SetIntRangeFn setResultRange) {
+  if (auto known = getKnownLaunchDim(*this, LaunchDims::Cluster))
+    return setResultRange(getResult(), getIndexRange(*known, *known));
+
   uint64_t max = kMaxClusterDim;
   if (auto specified = getUpperBound())
     max = specified->getZExtValue();
@@ -150,6 +166,8 @@ void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
 void ClusterBlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                          SetIntRangeFn setResultRange) {
   uint64_t max = kMaxClusterDim;
+  if (auto known = getKnownLaunchDim(*this, LaunchDims::Cluster))
+    max = *known;
   if (auto specified = getUpperBound())
     max = specified->getZExtValue();
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index f1cc1eb983267..55ee508aa9f55 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1149,3 +1149,45 @@ gpu.module @test_module_56 {
     func.return %sin16, %cos16, %sin32, %cos32, %sin64, %cos64 : f16, f16, f32, f32, f64, f64
   }
 }
+
+// -----
+
+gpu.module @test_module_cluster_size {
+  // CHECK-LABEL: llvm.func @kernel_with_cluster_size()
+  // CHECK-SAME: nvvm.cluster_dim = array<i32: 8, 2, 4>
+  gpu.func @kernel_with_cluster_size() kernel attributes {known_cluster_size = array<i32: 8, 2, 4>} {
+    gpu.return
+  }
+}
+
+// -----
+
+gpu.module @test_module_cluster_block_ops {
+// CHECK-LABEL: llvm.func @kernel_with_cluster_size(
+// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr)
+// CHECK-SAME: gpu.known_cluster_size = array<i32: 8, 4, 2>
+  gpu.func @kernel_with_cluster_size(%arg0: !llvm.ptr) kernel attributes {known_cluster_size = array<i32: 8, 4, 2>} {
+    // CHECK: nvvm.read.ptx.sreg.cluster.ctaid.x range <i32, 0, 8> : i32
+    %0 = gpu.cluster_block_id x
+    // CHECK: nvvm.read.ptx.sreg.cluster.ctaid.y range <i32, 0, 4> : i32
+    %1 = gpu.cluster_block_id y
+    // CHECK: nvvm.read.ptx.sreg.cluster.ctaid.z range <i32, 0, 2> : i32
+    %2 = gpu.cluster_block_id z
+    // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.x range <i32, 1, 9> : i32
+    %3 = gpu.cluster_dim_blocks x
+    // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.y range <i32, 1, 5> : i32
+    %4 = gpu.cluster_dim_blocks y
+    // CHECK: nvvm.read.ptx.sreg.cluster.nctaid.z range <i32, 1, 3> : i32
+    %5 = gpu.cluster_dim_blocks z
+
+    %6 = arith.addi %0, %1 : index
+    %7 = arith.addi %6, %2 : index
+    %8 = arith.addi %7, %3 : index
+    %9 = arith.addi %8, %4 : index
+    %10 = arith.addi %9, %5 : index
+    %11 = arith.index_cast %10 : index to i64
+    llvm.store %11, %arg0 : i64, !llvm.ptr
+    gpu.return
+  }
+}
+
diff --git a/mlir/test/Dialect/GPU/int-range-interface-cluster.mlir b/mlir/test/Dialect/GPU/int-range-interface-cluster.mlir
new file mode 100644
index 0000000000000..a7dd0df2e2c13
--- /dev/null
+++ b/mlir/test/Dialect/GPU/int-range-interface-cluster.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-opt -int-range-optimizations %s | FileCheck %s
+
+gpu.module @test_module {
+  gpu.func @test_cluster_ranges() kernel attributes {known_cluster_size = array<i32: 8, 4, 1>} {
+    %c0 = gpu.cluster_block_id x
+    // CHECK: test.reflect_bounds {smax = 7 : index, smin = 0 : index, umax = 7 : index, umin = 0 : index}
+    %c0_0 = test.reflect_bounds %c0 : index
+    %c1 = gpu.cluster_block_id y
+    // CHECK: test.reflect_bounds {smax = 3 : index, smin = 0 : index, umax = 3 : index, umin = 0 : index}
+    %c1_0 = test.reflect_bounds %c1 : index
+    %c2 = gpu.cluster_block_id z
+    // CHECK: test.reflect_bounds {smax = 0 : index, smin = 0 : index, umax = 0 : index, umin = 0 : index}
+    %c2_0 = test.reflect_bounds %c2 : index
+
+    %d0 = gpu.cluster_dim_blocks x
+    // CHECK: test.reflect_bounds {smax = 8 : index, smin = 8 : index, umax = 8 : index, umin = 8 : index}
+    %d0_0 = test.reflect_bounds %d0 : index
+    %d1 = gpu.cluster_dim_blocks y
+    // CHECK: test.reflect_bounds {smax = 4 : index, smin = 4 : index, umax = 4 : index, umin = 4 : index}
+    %d1_0 = test.reflect_bounds %d1 : index
+    %d2 = gpu.cluster_dim_blocks z
+    // CHECK: test.reflect_bounds {smax = 1 : index, smin = 1 : index, umax = 1 : index, umin = 1 : index}
+    %d2_0 = test.reflect_bounds %d2 : index
+
+    gpu.return
+  }
+}