[Mlir-commits] [mlir] [MLIR][GPU] Add support for non-portable cluster size attribute (PR #95545)

Fri Jun 14 06:49:29 PDT 2024

================
@@ -0,0 +1,124 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_90a cubin-features=+ptx80 opt-level=3" \
+// RUN:  | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+// CHECK: clusterIdx: (3, 3, 0) in Cluster Dimension: (4, 4, 1) blockIdx: (15, 15, 0)
+// CHECK: clusterIdx: (3, 3, 0) in Cluster Dimension: (4, 4, 1) blockIdx: (15, 15, 0)
+
+module attributes {gpu.container_module} {
+gpu.module @gpumodule {
+  gpu.func @kernel_cluster() kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 1>} {
+    %cidX = gpu.cluster_id  x
+    %cidY = gpu.cluster_id  y
+    %cidZ = gpu.cluster_id  z
+    %cdimX = gpu.cluster_dim_blocks  x
+    %cdimY = gpu.cluster_dim_blocks  y
+    %cdimZ = gpu.cluster_dim_blocks  z
+    %bidX = gpu.block_id  x
+    %bidY = gpu.block_id  y
+    %bidZ = gpu.block_id  z
+    %cidX_i32 = index.casts %cidX : index to i32
+    %cidY_i32 = index.casts %cidY : index to i32
+    %cidZ_i32 = index.casts %cidZ : index to i32
+    %cdimX_i32 = index.casts %cdimX : index to i32
+    %cdimY_i32 = index.casts %cdimY : index to i32
+    %cdimZ_i32 = index.casts %cdimZ : index to i32
+    %bidX_i32 = index.casts %bidX : index to i32
+    %bidY_i32 = index.casts %bidY : index to i32
+    %bidZ_i32 = index.casts %bidZ : index to i32
+
+    %c_1 = arith.constant -1 : index
+    %cBlocksX = gpu.grid_dim x
+    %cN_1 = arith.addi %cBlocksX, %c_1 : index
+    %cnd1 =  arith.cmpi eq, %bidX, %cN_1 : index
+    %cnd2 =  arith.cmpi eq, %bidY, %cN_1 : index
+    scf.if %cnd1 {
+      scf.if %cnd2 {
+        gpu.printf "clusterIdx: (%d, %d, %d) in Cluster Dimension: (%d, %d, %d) blockIdx: (%d, %d, %d) \n"
+          %cidX_i32,
+          %cidY_i32,
+          %cidZ_i32,
+          %cdimX_i32,
+          %cdimY_i32,
+          %cdimZ_i32,
+          %bidX_i32,
+          %bidY_i32,
+          %bidZ_i32
+          :
+          i32, i32, i32, i32, i32, i32, i32, i32, i32
+      }
+    }
+    gpu.return
+  }
+}
+
+func.func @main() {
+  %cDimX = arith.constant 4 : index
+  %cDimY = arith.constant 4 : index
+  %cDimZ = arith.constant 1 : index
+  %gDimX = arith.constant 16 : index
+  %gDimY = arith.constant 16 : index
+  %gDimZ = arith.constant 1 : index
+  %bDimX = arith.constant 1 : index
+  %bDimY = arith.constant 1 : index
+  %bDimZ = arith.constant 1 : index
+
+  gpu.launch clusters(%cx, %cy, %cz) in (%cluster_x = %cDimX, %cluster_y = %cDimY,
----------------
grypp wrote:

Can we actually test with a cluster larger than 8?

https://github.com/llvm/llvm-project/pull/95545