[Mlir-commits] [mlir] [mlir]introduce UnrollScopeInterface and apply it to funcOp and gpu.launch Op. (PR #123904)

Wed Jan 22 00:15:01 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-mlir

Author: lonely eagle (linuxlonelyeagle)

<details>
<summary>Changes</summary>

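When `affine-loop-unroll` is applied to a loop nested inside `gpu.launch`, a redundant SSA value is introduced: the constant that replaces the loop induction variable is created at the start of the enclosing function instead of inside the `gpu.launch` region, so after `-gpu-kernel-outlining` it is passed to the kernel as an extra argument (`%c0` in the output below).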
```
// example
module {
  func.func @main() {
    %c1 = arith.constant 1 : index
    gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
      %cst = arith.constant dense<0.000000e+00> : vector<2x4x2x2xf16>
      %cst_0 = arith.constant dense<0.000000e+00> : vector<2x4x2xf16>
      %cst_1 = arith.constant dense<0.000000e+00> : vector<4x2x2xf16>
      %0 = affine.for %arg12 = 0 to 2 iter_args(%arg13 = %cst) -> (vector<2x4x2x2xf16>) {
        %1 = affine.for %arg14 = 0 to 4 iter_args(%arg15 = %arg13) -> (vector<2x4x2x2xf16>) {
          %2 = vector.extract %cst_0[%arg12] : vector<4x2xf16> from vector<2x4x2xf16>
          %3 = vector.extract %cst_1[%arg14] : vector<2x2xf16> from vector<4x2x2xf16>
          %4 = vector.extract %arg15[%arg12, %arg14] : vector<2x2xf16> from vector<2x4x2x2xf16>
          %cst_2 = arith.constant dense<0.000000e+00> : vector<2x2xf16>
          %5 = vector.insert %cst_2, %arg13 [%arg12, %arg14] : vector<2x2xf16> into vector<2x4x2x2xf16>
          affine.yield %5 : vector<2x4x2x2xf16>
        }
        affine.yield %1 : vector<2x4x2x2xf16>
      }
      gpu.terminator
    }
    return
  }
}
```

```
mlir-opt test-affine-loops-unroll.mlir -affine-loop-unroll="unroll-full" -gpu-kernel-outlining                   root@e3f83748ef6b
#map = affine_map<(d0) -> (d0 + 1)>
#map1 = affine_map<(d0) -> (d0 + 2)>
#map2 = affine_map<(d0) -> (d0 + 3)>
module attributes {gpu.container_module} {
  func.func @main() {
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    gpu.launch_func  @main_kernel::@main_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)  args(%c0 : index)
    return
  }
  gpu.module @main_kernel {
    gpu.func @main_kernel(%arg0: index) kernel attributes {known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 1, 1, 1>} {
      %block_id_x = gpu.block_id  x
      %block_id_y = gpu.block_id  y
      %block_id_z = gpu.block_id  z
      %thread_id_x = gpu.thread_id  x
      %thread_id_y = gpu.thread_id  y
      %thread_id_z = gpu.thread_id  z
      %grid_dim_x = gpu.grid_dim  x
      %grid_dim_y = gpu.grid_dim  y
      %grid_dim_z = gpu.grid_dim  z
      %block_dim_x = gpu.block_dim  x
      %block_dim_y = gpu.block_dim  y
      %block_dim_z = gpu.block_dim  z
      %cst = arith.constant dense<0.000000e+00> : vector<2x4x2x2xf16>
      %cst_0 = arith.constant dense<0.000000e+00> : vector<2x4x2xf16>
      %cst_1 = arith.constant dense<0.000000e+00> : vector<4x2x2xf16>
      %0 = affine.for %arg1 = 0 to 2 iter_args(%arg2 = %cst) -> (vector<2x4x2x2xf16>) {
        %1 = vector.extract %cst_0[%arg1] : vector<4x2xf16> from vector<2x4x2xf16>
        %2 = vector.extract %cst_1[%arg0] : vector<2x2xf16> from vector<4x2x2xf16>
        %3 = vector.extract %arg2[%arg1, %arg0] : vector<2x2xf16> from vector<2x4x2x2xf16>
        %cst_2 = arith.constant dense<0.000000e+00> : vector<2x2xf16>
        %4 = vector.insert %cst_2, %arg2 [%arg1, %arg0] : vector<2x2xf16> into vector<2x4x2x2xf16>
        %5 = affine.apply #map(%arg0)
        %6 = vector.extract %cst_0[%arg1] : vector<4x2xf16> from vector<2x4x2xf16>
        %7 = vector.extract %cst_1[%5] : vector<2x2xf16> from vector<4x2x2xf16>
        %8 = vector.extract %4[%arg1, %5] : vector<2x2xf16> from vector<2x4x2x2xf16>
        %cst_3 = arith.constant dense<0.000000e+00> : vector<2x2xf16>
        %9 = vector.insert %cst_3, %arg2 [%arg1, %5] : vector<2x2xf16> into vector<2x4x2x2xf16>
        %10 = affine.apply #map1(%arg0)
        %11 = vector.extract %cst_0[%arg1] : vector<4x2xf16> from vector<2x4x2xf16>
        %12 = vector.extract %cst_1[%10] : vector<2x2xf16> from vector<4x2x2xf16>
        %13 = vector.extract %9[%arg1, %10] : vector<2x2xf16> from vector<2x4x2x2xf16>
        %cst_4 = arith.constant dense<0.000000e+00> : vector<2x2xf16>
        %14 = vector.insert %cst_4, %arg2 [%arg1, %10] : vector<2x2xf16> into vector<2x4x2x2xf16>
        %15 = affine.apply #map2(%arg0)
        %16 = vector.extract %cst_0[%arg1] : vector<4x2xf16> from vector<2x4x2xf16>
        %17 = vector.extract %cst_1[%15] : vector<2x2xf16> from vector<4x2x2xf16>
        %18 = vector.extract %14[%arg1, %15] : vector<2x2xf16> from vector<2x4x2x2xf16>
        %cst_5 = arith.constant dense<0.000000e+00> : vector<2x2xf16>
        %19 = vector.insert %cst_5, %arg2 [%arg1, %15] : vector<2x2xf16> into vector<2x4x2x2xf16>
        affine.yield %19 : vector<2x4x2x2xf16>
      }
      gpu.return
    }
  }
}
```
This PR fixes this issue. Feel free to comment below. Thank you.

---
Full diff: https://github.com/llvm/llvm-project/pull/123904.diff


12 Files Affected:

- (modified) mlir/include/mlir/Dialect/Func/IR/FuncOps.h (+1) 
- (modified) mlir/include/mlir/Dialect/Func/IR/FuncOps.td (+3-2) 
- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h (+1) 
- (modified) mlir/include/mlir/Dialect/GPU/IR/GPUOps.td (+2-1) 
- (modified) mlir/include/mlir/Interfaces/CMakeLists.txt (+1) 
- (added) mlir/include/mlir/Interfaces/UnrollScopeInterface.h (+21) 
- (added) mlir/include/mlir/Interfaces/UnrollScopeInterface.td (+36) 
- (modified) mlir/lib/Dialect/Affine/Utils/CMakeLists.txt (+1) 
- (modified) mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp (+4-3) 
- (modified) mlir/lib/Interfaces/CMakeLists.txt (+2) 
- (added) mlir/lib/Interfaces/UnrollScopeInterface.cpp (+18) 
- (modified) mlir/test/Dialect/Affine/unroll.mlir (+60) 


``````````diff

diff --git a/mlir/include/mlir/Dialect/Func/IR/FuncOps.h b/mlir/include/mlir/Dialect/Func/IR/FuncOps.h
index 5e10a9f50b774e..3f5566a28546d1 100644
--- a/mlir/include/mlir/Dialect/Func/IR/FuncOps.h
+++ b/mlir/include/mlir/Dialect/Func/IR/FuncOps.h
@@ -20,6 +20,7 @@
 #include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/UnrollScopeInterface.h"
 
 namespace mlir {
 class PatternRewriter;
diff --git a/mlir/include/mlir/Dialect/Func/IR/FuncOps.td b/mlir/include/mlir/Dialect/Func/IR/FuncOps.td
index 4da0efcb13ddf5..5c9f8c6a59f8f6 100644
--- a/mlir/include/mlir/Dialect/Func/IR/FuncOps.td
+++ b/mlir/include/mlir/Dialect/Func/IR/FuncOps.td
@@ -17,6 +17,7 @@ include "mlir/Interfaces/ControlFlowInterfaces.td"
 include "mlir/Interfaces/FunctionInterfaces.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/UnrollScopeInterface.td"
 
 def Func_Dialect : Dialect {
   let name = "func";
@@ -225,8 +226,8 @@ def ConstantOp : Func_Op<"constant",
 //===----------------------------------------------------------------------===//
 
 def FuncOp : Func_Op<"func", [
-  AffineScope, AutomaticAllocationScope,
-  FunctionOpInterface, IsolatedFromAbove, OpAsmOpInterface
+  AffineScope, AutomaticAllocationScope, FunctionOpInterface,
+  IsolatedFromAbove, OpAsmOpInterface, UnrollScopeInterface
 ]> {
   let summary = "An operation with a name containing a single `SSACFG` region";
   let description = [{
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
index 7b53594a1c8e28..0cf2d0c77383f1 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -29,6 +29,7 @@
 #include "mlir/Interfaces/InferIntRangeInterface.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/UnrollScopeInterface.h"
 #include "llvm/ADT/STLExtras.h"
 
 namespace mlir {
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 3adfd5f4f2c436..8279bb9985ea3e 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -30,6 +30,7 @@ include "mlir/Interfaces/FunctionInterfaces.td"
 include "mlir/Interfaces/InferIntRangeInterface.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/UnrollScopeInterface.td"
 
 //===----------------------------------------------------------------------===//
 // GPU Dialect operations.
@@ -796,7 +797,7 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
 def GPU_LaunchOp : GPU_Op<"launch", [
       AutomaticAllocationScope, AttrSizedOperandSegments, GPU_AsyncOpInterface,
       DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,
-      RecursiveMemoryEffects]>,
+      RecursiveMemoryEffects, UnrollScopeInterface]>,
     Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
                Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
diff --git a/mlir/include/mlir/Interfaces/CMakeLists.txt b/mlir/include/mlir/Interfaces/CMakeLists.txt
index d81298bb4daf01..cd6cc084dd2801 100644
--- a/mlir/include/mlir/Interfaces/CMakeLists.txt
+++ b/mlir/include/mlir/Interfaces/CMakeLists.txt
@@ -17,6 +17,7 @@ add_mlir_interface(TilingInterface)
 add_mlir_interface(ValueBoundsOpInterface)
 add_mlir_interface(VectorInterfaces)
 add_mlir_interface(ViewLikeInterface)
+add_mlir_interface(UnrollScopeInterface)
 
 set(LLVM_TARGET_DEFINITIONS MemorySlotInterfaces.td)
 mlir_tablegen(MemorySlotOpInterfaces.h.inc -gen-op-interface-decls)
diff --git a/mlir/include/mlir/Interfaces/UnrollScopeInterface.h b/mlir/include/mlir/Interfaces/UnrollScopeInterface.h
new file mode 100644
index 00000000000000..f7d71b6f9be654
--- /dev/null
+++ b/mlir/include/mlir/Interfaces/UnrollScopeInterface.h
@@ -0,0 +1,21 @@
+//===- UnrollScopeInterface.h - unroll region interface -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the operation interface for unroll region
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_INTERFACES_UNROLLSCOPEINTERFACE_H_
+#define MLIR_INTERFACES_UNROLLSCOPEINTERFACE_H_
+
+#include "mlir/IR/OpDefinition.h"
+
+/// Include the generated interface declarations.
+#include "mlir/Interfaces/UnrollScopeInterface.h.inc"
+
+#endif // MLIR_INTERFACES_UNROLLSCOPEINTERFACE_H_
diff --git a/mlir/include/mlir/Interfaces/UnrollScopeInterface.td b/mlir/include/mlir/Interfaces/UnrollScopeInterface.td
new file mode 100644
index 00000000000000..5ad5e5b44cfe14
--- /dev/null
+++ b/mlir/include/mlir/Interfaces/UnrollScopeInterface.td
@@ -0,0 +1,36 @@
+//===- UnrollScopeInterface.td - unroll scope interface ----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the interface for unroll region.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_INTERFACES_UNROLLSCOPEINTERFACE
+#define MLIR_INTERFACES_UNROLLSCOPEINTERFACE
+
+include "mlir/IR/OpBase.td"
+
+def UnrollScopeInterface : OpInterface<"UnrollScopeInterface"> {
+  let description = [{
+    This interface controls the scope of loop unrolling. It ensures
+    that SSA values generated outside the loop when unrolling are
+    placed in the nearest `UnrollScopeInterface` region.
+  }];
+  let cppNamespace = "::mlir";
+  let methods = [
+    InterfaceMethod<[{
+      return the `UnrollScopeInterface` region.
+    }],
+    "::mlir::Region&", "getUnrollBody", (ins),
+    /*methodBody=*/[{}], /*defaultImplementation=*/[{
+      return $_op->getRegion(0);
+    }]>,
+  ];
+}
+
+#endif // MLIR_INTERFACES_UNROLLSCOPEINTERFACE
diff --git a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt
index ef6e0dbf45d3a9..d2993a424060f8 100644
--- a/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/Affine/Utils/CMakeLists.txt
@@ -16,4 +16,5 @@ add_mlir_dialect_library(MLIRAffineUtils
   MLIRMemRefDialect
   MLIRTransformUtils
   MLIRViewLikeInterface
+  MLIRUnrollScopeInterface
   )
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
index 4e02559a089493..c697d3b0127a83 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp
@@ -21,6 +21,7 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/IntegerSet.h"
+#include "mlir/Interfaces/UnrollScopeInterface.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -129,10 +130,10 @@ LogicalResult mlir::affine::promoteIfSingleIteration(AffineForOp forOp) {
   auto *parentBlock = forOp->getBlock();
   if (!iv.use_empty()) {
     if (forOp.hasConstantLowerBound()) {
-      auto func = forOp->getParentOfType<FunctionOpInterface>();
+      auto unrollScope = forOp->getParentOfType<UnrollScopeInterface>();
       OpBuilder builder(forOp->getContext());
-      if (func)
-        builder.setInsertionPointToStart(&func.getFunctionBody().front());
+      if (unrollScope)
+        builder.setInsertionPointToStart(&unrollScope.getUnrollBody().front());
       else
         builder.setInsertionPoint(forOp);
       auto constOp = builder.create<arith::ConstantIndexOp>(
diff --git a/mlir/lib/Interfaces/CMakeLists.txt b/mlir/lib/Interfaces/CMakeLists.txt
index d3b7bf65ad3e73..bd0b79aaecab0c 100644
--- a/mlir/lib/Interfaces/CMakeLists.txt
+++ b/mlir/lib/Interfaces/CMakeLists.txt
@@ -21,6 +21,7 @@ set(LLVM_OPTIONAL_SOURCES
   ValueBoundsOpInterface.cpp
   VectorInterfaces.cpp
   ViewLikeInterface.cpp
+  UnrollScopeInterface.cpp
   )
 
 function(add_mlir_interface_library name)
@@ -46,6 +47,7 @@ add_mlir_interface_library(CopyOpInterface)
 add_mlir_interface_library(DataLayoutInterfaces)
 add_mlir_interface_library(DerivedAttributeOpInterface)
 add_mlir_interface_library(DestinationStyleOpInterface)
+add_mlir_interface_library(UnrollScopeInterface)
 
 add_mlir_library(MLIRFunctionInterfaces
   FunctionInterfaces.cpp
diff --git a/mlir/lib/Interfaces/UnrollScopeInterface.cpp b/mlir/lib/Interfaces/UnrollScopeInterface.cpp
new file mode 100644
index 00000000000000..b500f5ad01f2ad
--- /dev/null
+++ b/mlir/lib/Interfaces/UnrollScopeInterface.cpp
@@ -0,0 +1,18 @@
+//===- UnrollScopeInterface.cpp - unroll scope interface in MLIR ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Interfaces/UnrollScopeInterface.h"
+
+using namespace mlir;
+
+//===----------------------------------------------------------------------===//
+// UnrollScopeInterface Interface
+//===----------------------------------------------------------------------===//
+
+/// Include the definitions of the unroll scope interface.
+#include "mlir/Interfaces/UnrollScopeInterface.cpp.inc"
diff --git a/mlir/test/Dialect/Affine/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir
index e398c3fe2011dd..4fdaa1a7405d1a 100644
--- a/mlir/test/Dialect/Affine/unroll.mlir
+++ b/mlir/test/Dialect/Affine/unroll.mlir
@@ -240,6 +240,66 @@ func.func @loop_nest_unroll_full() {
   return
 } // UNROLL-FULL }
 
+
+// UNROLL-FULL-LABEL: func @gpu_launch_unroll() {
+
+func.func @gpu_launch_unroll() {
+  %c1 = arith.constant 1 : index
+  gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) {
+    %cst = arith.constant dense<0.000000e+00> : vector<2x4x2x2xf16>
+    %cst_0 = arith.constant dense<0.000000e+00> : vector<2x4x2xf16>
+    %cst_1 = arith.constant dense<0.000000e+00> : vector<4x2x2xf16>
+    %0 = affine.for %arg12 = 0 to 2 iter_args(%arg13 = %cst) -> (vector<2x4x2x2xf16>) {
+      %1 = affine.for %arg14 = 0 to 4 iter_args(%arg15 = %arg13) -> (vector<2x4x2x2xf16>) {
+        %2 = vector.extract %cst_0[%arg12] : vector<4x2xf16> from vector<2x4x2xf16>
+        %3 = vector.extract %cst_1[%arg14] : vector<2x2xf16> from vector<4x2x2xf16>
+        %4 = vector.extract %arg15[%arg12, %arg14] : vector<2x2xf16> from vector<2x4x2x2xf16>
+        %cst_2 = arith.constant dense<0.000000e+00> : vector<2x2xf16>
+        %5 = vector.insert %cst_2, %arg13 [%arg12, %arg14] : vector<2x2xf16> into vector<2x4x2x2xf16>
+        affine.yield %5 : vector<2x4x2x2xf16>
+      }
+      affine.yield %1 : vector<2x4x2x2xf16>
+    }
+    gpu.terminator
+  }
+  return
+}
+
+// UNROLL-FULL: %[[VAL_0:.*]] = arith.constant 1 : index
+// UNROLL-FULL: gpu.launch blocks(%[[VAL_1:.*]], %[[VAL_2:.*]], %[[VAL_3:.*]]) in (%[[VAL_4:.*]] = %[[VAL_0]], %[[VAL_5:.*]] = %[[VAL_0]], %[[VAL_6:.*]] = %[[VAL_0]]) threads(%[[VAL_7:.*]], %[[VAL_8:.*]], %[[VAL_9:.*]]) in (%[[VAL_10:.*]] = %[[VAL_0]], %[[VAL_11:.*]] = %[[VAL_0]], %[[VAL_12:.*]] = %[[VAL_0]]) {
+// UNROLL-FULL:   %[[VAL_13:.*]] = arith.constant 0 : index
+// UNROLL-FULL:   %[[VAL_14:.*]] = arith.constant dense<0.000000e+00> : vector<2x4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_15:.*]] = arith.constant dense<0.000000e+00> : vector<2x4x2xf16>
+// UNROLL-FULL:   %[[VAL_16:.*]] = arith.constant dense<0.000000e+00> : vector<4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_17:.*]] = affine.for %[[VAL_18:.*]] = 0 to 2 iter_args(%[[VAL_19:.*]] = %[[VAL_14]]) -> (vector<2x4x2x2xf16>) {
+// UNROLL-FULL:   %[[VAL_20:.*]] = vector.extract %[[VAL_15]]{{\[}}%[[VAL_18]]] : vector<4x2xf16> from vector<2x4x2xf16>
+// UNROLL-FULL:   %[[VAL_21:.*]] = vector.extract %[[VAL_16]]{{\[}}%[[VAL_13]]] : vector<2x2xf16> from vector<4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_22:.*]] = vector.extract %[[VAL_19]]{{\[}}%[[VAL_18]], %[[VAL_13]]] : vector<2x2xf16> from vector<2x4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_23:.*]] = arith.constant dense<0.000000e+00> : vector<2x2xf16>
+// UNROLL-FULL:   %[[VAL_24:.*]] = vector.insert %[[VAL_23]], %[[VAL_19]] {{\[}}%[[VAL_18]], %[[VAL_13]]] : vector<2x2xf16> into vector<2x4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_25:.*]] = affine.apply [[$MAP0]](%[[VAL_13]])
+// UNROLL-FULL:   %[[VAL_26:.*]] = vector.extract %[[VAL_15]]{{\[}}%[[VAL_18]]] : vector<4x2xf16> from vector<2x4x2xf16>
+// UNROLL-FULL:   %[[VAL_27:.*]] = vector.extract %[[VAL_16]]{{\[}}%[[VAL_25]]] : vector<2x2xf16> from vector<4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_28:.*]] = vector.extract %[[VAL_24]]{{\[}}%[[VAL_18]], %[[VAL_25]]] : vector<2x2xf16> from vector<2x4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_29:.*]] = arith.constant dense<0.000000e+00> : vector<2x2xf16>
+// UNROLL-FULL:   %[[VAL_30:.*]] = vector.insert %[[VAL_29]], %[[VAL_19]] {{\[}}%[[VAL_18]], %[[VAL_25]]] : vector<2x2xf16> into vector<2x4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_31:.*]] = affine.apply [[$MAP1]](%[[VAL_13]])
+// UNROLL-FULL:   %[[VAL_32:.*]] = vector.extract %[[VAL_15]]{{\[}}%[[VAL_18]]] : vector<4x2xf16> from vector<2x4x2xf16>
+// UNROLL-FULL:   %[[VAL_33:.*]] = vector.extract %[[VAL_16]]{{\[}}%[[VAL_31]]] : vector<2x2xf16> from vector<4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_34:.*]] = vector.extract %[[VAL_30]]{{\[}}%[[VAL_18]], %[[VAL_31]]] : vector<2x2xf16> from vector<2x4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_35:.*]] = arith.constant dense<0.000000e+00> : vector<2x2xf16>
+// UNROLL-FULL:   %[[VAL_36:.*]] = vector.insert %[[VAL_35]], %[[VAL_19]] {{\[}}%[[VAL_18]], %[[VAL_31]]] : vector<2x2xf16> into vector<2x4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_37:.*]] = affine.apply [[$MAP2]](%[[VAL_13]])
+// UNROLL-FULL:   %[[VAL_38:.*]] = vector.extract %[[VAL_15]]{{\[}}%[[VAL_18]]] : vector<4x2xf16> from vector<2x4x2xf16>
+// UNROLL-FULL:   %[[VAL_39:.*]] = vector.extract %[[VAL_16]]{{\[}}%[[VAL_37]]] : vector<2x2xf16> from vector<4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_40:.*]] = vector.extract %[[VAL_36]]{{\[}}%[[VAL_18]], %[[VAL_37]]] : vector<2x2xf16> from vector<2x4x2x2xf16>
+// UNROLL-FULL:   %[[VAL_41:.*]] = arith.constant dense<0.000000e+00> : vector<2x2xf16>
+// UNROLL-FULL:   %[[VAL_42:.*]] = vector.insert %[[VAL_41]], %[[VAL_19]] {{\[}}%[[VAL_18]], %[[VAL_37]]] : vector<2x2xf16> into vector<2x4x2x2xf16>
+// UNROLL-FULL:   affine.yield %[[VAL_42]] : vector<2x4x2x2xf16>
+// UNROLL-FULL: }
+// UNROLL-FULL: gpu.terminator
+
+
 // SHORT-LABEL: func @loop_nest_outer_unroll() {
 func.func @loop_nest_outer_unroll() {
   // SHORT:      affine.for %arg0 = 0 to 4 {

``````````

</details>


https://github.com/llvm/llvm-project/pull/123904