[Mlir-commits] [mlir] [mlir]introduce UnrollScopeInterface and apply it to funcOp and gpu.launch Op. (PR #123904)

Fri Jan 24 14:00:00 PST 2025

krzysz00 wrote:

```mlir
// example
// Reproducer input: a two-level affine loop nest (0 to 2 outer, 0 to 4 inner)
// placed inside a gpu.launch region. Each inner iteration inserts a zero
// vector<2x2xf16> into the loop-carried vector<2x4x2x2xf16>, and the final
// value is written to a gpu.alloc'ed buffer.
func.func @gpu_launch_unroll() {
  %buf = gpu.alloc() : memref<2x4x2x2xf16, #gpu.address_space<global>>
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  // Every grid and block dimension is %c1.
  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
    %cst = arith.constant dense<0.000000e+00> : vector<2x4x2x2xf16>
    // Outer loop: its induction variable %arg12 is the first insert position.
    %0 = affine.for %arg12 = 0 to 2 iter_args(%arg13 = %cst) -> (vector<2x4x2x2xf16>) {
      // Inner loop: its induction variable %arg14 is the second insert position.
      %1 = affine.for %arg14 = 0 to 4 iter_args(%arg15 = %arg13) -> (vector<2x4x2x2xf16>) {
        %cst_0 = arith.constant dense<0.000000e+00> : vector<2x2xf16>
        %2 = vector.insert %cst_0, %arg15 [%arg12, %arg14] : vector<2x2xf16> into vector<2x4x2x2xf16>
        affine.yield %2 : vector<2x4x2x2xf16>
      }
      affine.yield %1 : vector<2x4x2x2xf16>
    }
    // NOTE(review): the attribute below is spelled `inbounds`; this is
    // presumably meant to be `in_bounds` -- confirm. It is left as written
    // because the output shown after this snippet was produced from it.
    vector.transfer_write %0, %buf[%c0, %c0, %c0, %c0] {inbounds = [true, true, true, true]} : vector<2x4x2x2xf16>, memref<2x4x2x2xf16, #gpu.address_space<global>>
    gpu.terminator
  }
  gpu.dealloc %buf : memref<2x4x2x2xf16, #gpu.address_space<global>>
  return
}
```
If I run this through `mlir-opt -affine-loop-unroll="unroll-full" -canonicalize -cse -gpu-launch-sink-index-computations -gpu-kernel-outlining -canonicalize`, it gives:
```
// Pipeline output (verbatim). The host function now launches an outlined
// kernel, @gpu_launch_unroll_kernel, via gpu.launch_func.
module attributes {gpu.container_module} {
  func.func @gpu_launch_unroll() {
    %c1 = arith.constant 1 : index
    %memref = gpu.alloc  () : memref<2x4x2x2xf16, #gpu.address_space<global>>
    gpu.launch_func  @gpu_launch_unroll_kernel::@gpu_launch_unroll_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)  args(%memref : memref<2x4x2x2xf16, #gpu.address_space<global>>)
    gpu.dealloc  %memref : memref<2x4x2x2xf16, #gpu.address_space<global>>
    return
  }
  gpu.module @gpu_launch_unroll_kernel {
    gpu.func @gpu_launch_unroll_kernel(%arg0: memref<2x4x2x2xf16, #gpu.address_space<global>>) kernel attributes {known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 1, 1, 1>} {
      %cst = arith.constant dense<0.000000e+00> : vector<2x4x2x2xf16>
      %c3 = arith.constant 3 : index
      %c2 = arith.constant 2 : index
      %c1 = arith.constant 1 : index
      %c0 = arith.constant 0 : index
      %cst_0 = arith.constant dense<0.000000e+00> : vector<2x2xf16>
      // The outer loop (0 to 2) is still an affine.for here; only the inner
      // loop (0 to 4) has been replaced by the four vector.insert ops below,
      // whose second position is the constant %c0, %c1, %c2, %c3 in turn.
      %0 = affine.for %arg1 = 0 to 2 iter_args(%arg2 = %cst) -> (vector<2x4x2x2xf16>) {
        %1 = vector.insert %cst_0, %arg2 [%arg1, %c0] : vector<2x2xf16> into vector<2x4x2x2xf16>
        %2 = vector.insert %cst_0, %1 [%arg1, %c1] : vector<2x2xf16> into vector<2x4x2x2xf16>
        %3 = vector.insert %cst_0, %2 [%arg1, %c2] : vector<2x2xf16> into vector<2x4x2x2xf16>
        %4 = vector.insert %cst_0, %3 [%arg1, %c3] : vector<2x2xf16> into vector<2x4x2x2xf16>
        affine.yield %4 : vector<2x4x2x2xf16>
      }
      // Both `in_bounds` and the input's `inbounds` spelling appear on this op.
      vector.transfer_write %0, %arg0[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true], inbounds = [true, true, true, true]} : vector<2x4x2x2xf16>, memref<2x4x2x2xf16, #gpu.address_space<global>>
      gpu.return
    }
  }
}
```


https://github.com/llvm/llvm-project/pull/123904