[Mlir-commits] [mlir] [MLIR][Vector] Refine transfer in_bounds inference for indexed accesses (PR #193920)

Tue May 5 05:08:37 PDT 2026

https://github.com/zackc6 updated https://github.com/llvm/llvm-project/pull/193920

>From 7b863e908a306d01a39defd69286cabb84924c97 Mon Sep 17 00:00:00 2001
From: zack <zackchen666 at gmail.com>
Date: Fri, 24 Apr 2026 14:53:38 +0800
Subject: [PATCH] [MLIR][Vector] Refine transfer in_bounds inference for
 indexed accesses

Compute transfer `in_bounds` from mapped indices and static shapes, with explicit read/write dimension alignment. Add bounded `scf.for` IV handling and a conservative fallback for unknown indices to preserve existing loop lowering behavior. Extend vectorization tests for insert-slice and transform-vector pipelines, including IV-indexed cases.
---
 mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp | 159 ++++++++++++++++--
 .../insert-slice-with-patterns.mlir           |  97 +++++++++++
 .../test/Dialect/Vector/transform-vector.mlir |  67 ++++++++
 3 files changed, 306 insertions(+), 17 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
index 576023dbc9de1..13f73af057651 100644
--- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
+++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp
@@ -18,6 +18,7 @@
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -406,6 +407,137 @@ static bool isMaskTriviallyFoldable(SmallVector<OpFoldResult> &maskSizes,
   return true;
 }
 
+/// Returns true if `index` is an scf.for IV that is statically proven in-bounds
+/// for accesses of length `vecSize` into a static dimension `baseDimSize`.
+///
+/// This handles the common vector loop form:
+///   scf.for %iv = %c0 to %ub step %c<vecSize>
+/// and proves `%iv + vecSize <= baseDimSize` for all iterations, including the
+/// last one when `%ub` is not a multiple of the step.
+static bool isKnownInBoundsForLoopIV(Value index, int64_t vecSize,
+                                     int64_t baseDimSize) {
+  auto iv = dyn_cast<BlockArgument>(index);
+  if (!iv)
+    return false;
+
+  // The owning block may be unlinked (null parent op); treat that as unknown.
+  auto forOp = dyn_cast_if_present<scf::ForOp>(iv.getOwner()->getParentOp());
+  if (!forOp || forOp.getInductionVar() != index)
+    return false;
+
+  APSInt lbInt, ubInt, stepInt;
+  if (!matchPattern(forOp.getLowerBound(), m_ConstantInt(&lbInt)) ||
+      !matchPattern(forOp.getUpperBound(), m_ConstantInt(&ubInt)) ||
+      !matchPattern(forOp.getStep(), m_ConstantInt(&stepInt)))
+    return false;
+
+  int64_t lb = lbInt.getSExtValue();
+  int64_t ub = ubInt.getSExtValue();
+  int64_t step = stepInt.getSExtValue();
+  if (lb != 0 || step <= 0 || step != vecSize)
+    return false;
+
+  // Empty loop executes no transfer accesses.
+  if (ub <= lb)
+    return true;
+
+  // The largest IV is the last multiple of `step` below `ub`. Comparing it to
+  // `baseDimSize - vecSize` avoids forming `iv + vecSize`, which may overflow.
+  int64_t lastIv = ((ub - 1) / step) * step;
+  return baseDimSize >= vecSize && lastIv <= baseDimSize - vecSize;
+}
+
+/// Returns `in_bounds` derived from static shapes and transfer indices. Indices
+/// that are neither constants nor recognized IVs use a size-only fallback.
+///
+/// `baseDims` and `alignedIndices` must be aligned with `vectorShape`:
+///   * `baseDims[vecDim]` is the base tensor dim checked for `vecDim`.
+///   * `alignedIndices[vecDim]` is the index used for that mapped base dim.
+static SmallVector<bool> computeInBoundsFromStaticShapeAndIndices(
+    ArrayRef<int64_t> baseShape, ArrayRef<int64_t> vectorShape,
+    ArrayRef<int64_t> baseDims, ArrayRef<Value> alignedIndices,
+    bool unknownIndexRequiresExactSize) {
+  SmallVector<bool> inBounds(vectorShape.size(), false);
+  if (baseDims.size() != vectorShape.size() ||
+      alignedIndices.size() != vectorShape.size())
+    return inBounds;
+
+  const int64_t baseRank = baseShape.size();
+  for (auto [vecDim, vecSize] : llvm::enumerate(vectorShape)) {
+    int64_t baseDim = baseDims[vecDim];
+    Value index = alignedIndices[vecDim];
+    if (baseDim < 0 || baseDim >= baseRank || vecSize < 0 ||
+        !ShapedType::isStatic(baseShape[baseDim]))
+      continue;
+
+    int64_t baseDimSize = baseShape[baseDim];
+    APSInt indexValue;
+    if (matchPattern(index, m_ConstantInt(&indexValue))) {
+      int64_t idx = indexValue.getSExtValue();
+      // Require 0 <= idx and idx + vecSize <= baseDimSize, overflow-free.
+      inBounds[vecDim] =
+          idx >= 0 && idx <= baseDimSize && vecSize <= baseDimSize - idx;
+      continue;
+    }
+
+    if (isKnownInBoundsForLoopIV(index, vecSize, baseDimSize)) {
+      inBounds[vecDim] = true;
+      continue;
+    }
+
+    // Unknown index: keep the pre-existing size-only rule (exact match or
+    // `>=`, per the flag). This does not prove the access is in-bounds.
+    inBounds[vecDim] = unknownIndexRequiresExactSize ? (baseDimSize == vecSize)
+                                                     : (baseDimSize >= vecSize);
+  }
+  return inBounds;
+}
+
+/// Derives in_bounds for a transfer_read whose vector rank equals the source
+/// rank: vector dim `i` is checked against source dim `i` and read index `i`.
+static SmallVector<bool>
+computeReadInBoundsFromStaticShape(ArrayRef<int64_t> sourceShape,
+                                   ArrayRef<int64_t> vectorShape,
+                                   ArrayRef<Value> readIndices) {
+  const size_t rank = vectorShape.size();
+  if (sourceShape.size() != rank || readIndices.size() != rank)
+    return SmallVector<bool>(rank, false);
+
+  SmallVector<int64_t> identityDims(rank);
+  for (size_t dim = 0; dim < rank; ++dim)
+    identityDims[dim] = dim;
+
+  return computeInBoundsFromStaticShapeAndIndices(
+      sourceShape, vectorShape, identityDims, readIndices,
+      /*unknownIndexRequiresExactSize=*/true);
+}
+
+/// Derives in_bounds for a transfer_write. `writeIndices` holds one index per
+/// destination dim, while the vector covers only the trailing destination
+/// dims, so both the dims and the indices are sliced to the vector rank.
+static SmallVector<bool>
+computeWriteInBoundsFromStaticShape(ArrayRef<int64_t> destShape,
+                                    ArrayRef<int64_t> vectorShape,
+                                    ArrayRef<Value> writeIndices) {
+  const size_t destRank = destShape.size();
+  const size_t vecRank = vectorShape.size();
+  if (writeIndices.size() != destRank || destRank < vecRank)
+    return SmallVector<bool>(vecRank, false);
+
+  // Vector dim `i` is checked against destination dim `leadingDims + i`.
+  const size_t leadingDims = destRank - vecRank;
+  ArrayRef<Value> trailingIndices = writeIndices.drop_front(leadingDims);
+
+  SmallVector<int64_t> trailingDims;
+  trailingDims.reserve(vecRank);
+  for (size_t dim = leadingDims; dim < destRank; ++dim)
+    trailingDims.push_back(dim);
+
+  return computeInBoundsFromStaticShapeAndIndices(
+      destShape, vectorShape, trailingDims, trailingIndices,
+      /*unknownIndexRequiresExactSize=*/false);
+}
+
 Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
                                      Value source,
                                      ArrayRef<int64_t> inputVectorSizes,
@@ -441,16 +573,13 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc,
          "expected same pad element type to match source element type");
 
   auto zero = arith::ConstantIndexOp::create(builder, loc, 0);
+  SmallVector<Value> indices(vecToReadRank, zero);
   SmallVector<bool> inBoundsVal(vecToReadRank, true);
 
   if (useInBoundsInsteadOfMasking) {
-    // Update the inBounds attribute.
-    // FIXME: This computation is too weak - it ignores the read indices.
-    for (unsigned i = 0; i < vecToReadRank; i++)
-      inBoundsVal[i] = (sourceShape[i] == vecToReadShape[i]) &&
-                       ShapedType::isStatic(sourceShape[i]);
+    inBoundsVal = computeReadInBoundsFromStaticShape(sourceShape,
+                                                     vecToReadShape, indices);
   }
-  SmallVector<Value> indices(vecToReadRank, zero);
   auto transferReadOp =
       vector::TransferReadOp::create(builder, loc,
                                      /*vectorType=*/vecToReadTy,
@@ -491,17 +620,6 @@ Operation *vector::createWriteOrMaskedWrite(OpBuilder &builder, Location loc,
   int64_t vecToStoreRank = vecToStoreType.getRank();
   auto vecToStoreShape = vecToStoreType.getShape();
 
-  // Compute the in_bounds attribute
-  SmallVector<bool> inBoundsVal(vecToStoreRank, true);
-  if (useInBoundsInsteadOfMasking) {
-    // Update the inBounds attribute.
-    // FIXME: This computation is too weak - it ignores the write indices.
-    for (unsigned i = 0; i < vecToStoreRank; i++)
-      inBoundsVal[i] =
-          (destShape[destRank - vecToStoreRank + i] >= vecToStoreShape[i]) &&
-          ShapedType::isStatic(destShape[destRank - vecToStoreRank + i]);
-  }
-
   // If missing, initialize the write indices to 0.
   bool useDefaultWriteIdxs = writeIndices.empty();
   assert((useDefaultWriteIdxs ||
@@ -512,6 +630,13 @@ Operation *vector::createWriteOrMaskedWrite(OpBuilder &builder, Location loc,
     writeIndices.assign(destRank, zero);
   }
 
+  // Compute the in_bounds attribute.
+  SmallVector<bool> inBoundsVal(vecToStoreRank, true);
+  if (useInBoundsInsteadOfMasking) {
+    inBoundsVal = computeWriteInBoundsFromStaticShape(
+        destShape, vecToStoreShape, writeIndices);
+  }
+
   // Generate the xfer_write Op
   Operation *write = vector::TransferWriteOp::create(builder, loc,
                                                      /*vector=*/vecToStore,
diff --git a/mlir/test/Dialect/Linalg/vectorization/insert-slice-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/insert-slice-with-patterns.mlir
index f7764be9be73f..c83856a7c99de 100644
--- a/mlir/test/Dialect/Linalg/vectorization/insert-slice-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/insert-slice-with-patterns.mlir
@@ -59,6 +59,103 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+// All source dimensions are dynamic, so read in_bounds cannot be proven for
+// any vector dimension. The transfer_read should therefore print without an
+// in_bounds attribute (all-false canonical form).
+//
+// CHECK-LABEL:   func.func @insert_dynamic_slice_all_false_read_in_bounds(
+// CHECK-SAME:      %[[ARG_0:.*]]: tensor<?x?x?xf32>,
+// CHECK-SAME:      %[[PAD:.*]]: f32, %[[SZ0:.*]]: index, %[[SZ1:.*]]: index, %[[SZ2:.*]]: index) -> tensor<9x8x7x1x2x3xf32> {
+// CHECK:           %[[EMPTY:.*]] = tensor.empty() : tensor<9x8x7x1x2x3xf32>
+// CHECK:           %[[BC:.*]] = vector.broadcast %[[PAD]] : f32 to vector<9x8x7x1x2x3xf32>
+// CHECK:           %[[WRITE:.*]] = vector.transfer_write %[[BC]], %[[EMPTY]]{{.*}} {in_bounds = [true, true, true, true, true, true]} : vector<9x8x7x1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[ARG_0]]{{.*}}, %[[PAD]] : tensor<?x?x?xf32>, vector<1x2x3xf32>
+// CHECK:           %[[RES:.*]] = vector.transfer_write %[[READ]], %[[WRITE]]{{.*}} {in_bounds = [true, true, true]} : vector<1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
+// CHECK:           return %[[RES]] : tensor<9x8x7x1x2x3xf32>
+func.func @insert_dynamic_slice_all_false_read_in_bounds(
+    %arg0: tensor<?x?x?xf32>, %pad : f32,
+    %sz0: index, %sz1: index, %sz2: index) -> tensor<9x8x7x1x2x3xf32> {
+  %init = tensor.empty() : tensor<9x8x7x1x2x3xf32>
+  %fill = linalg.fill ins(%pad : f32) outs(%init : tensor<9x8x7x1x2x3xf32>) -> tensor<9x8x7x1x2x3xf32>
+  %res = tensor.insert_slice %arg0 into %fill[0, 0, 0, 0, 0, 0] [1, 1, 1, %sz0, %sz1, %sz2][1, 1, 1, 1, 1, 1] : tensor<?x?x?xf32> into tensor<9x8x7x1x2x3xf32>
+  return %res : tensor<9x8x7x1x2x3xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// All write offsets in the vectorized dimensions are dynamic, so write
+// in_bounds cannot be proven for any vector dimension. The transfer_write
+// should print without an in_bounds attribute (all-false canonical form).
+//
+// CHECK-LABEL:   func.func @insert_static_slice_all_false_write_in_bounds(
+// CHECK-SAME:      %[[ARG_0:.*]]: tensor<1x2x3xf32>,
+// CHECK-SAME:      %[[PAD:.*]]: f32, %[[OFF0:.*]]: index, %[[OFF1:.*]]: index, %[[OFF2:.*]]: index) -> tensor<9x8x7x1x2x3xf32> {
+// CHECK:           %[[EMPTY:.*]] = tensor.empty() : tensor<9x8x7x1x2x3xf32>
+// CHECK:           %[[BC:.*]] = vector.broadcast %[[PAD]] : f32 to vector<9x8x7x1x2x3xf32>
+// CHECK:           %[[WRITE:.*]] = vector.transfer_write %[[BC]], %[[EMPTY]]{{.*}} {in_bounds = [true, true, true, true, true, true]} : vector<9x8x7x1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[ARG_0]]{{.*}}, %[[PAD]] {in_bounds = [true, true, true]} : tensor<1x2x3xf32>, vector<1x2x3xf32>
+// CHECK:           %[[RES:.*]] = vector.transfer_write %[[READ]], %[[WRITE]]{{.*}} : vector<1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
+// CHECK:           return %[[RES]] : tensor<9x8x7x1x2x3xf32>
+func.func @insert_static_slice_all_false_write_in_bounds(
+    %arg0: tensor<1x2x3xf32>, %pad : f32,
+    %off0: index, %off1: index, %off2: index) -> tensor<9x8x7x1x2x3xf32> {
+  %init = tensor.empty() : tensor<9x8x7x1x2x3xf32>
+  %fill = linalg.fill ins(%pad : f32) outs(%init : tensor<9x8x7x1x2x3xf32>) -> tensor<9x8x7x1x2x3xf32>
+  %res = tensor.insert_slice %arg0 into %fill[0, 0, 0, %off0, %off1, %off2] [1, 1, 1, 1, 2, 3][1, 1, 1, 1, 1, 1] : tensor<1x2x3xf32> into tensor<9x8x7x1x2x3xf32>
+  return %res : tensor<9x8x7x1x2x3xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// A non-zero insert offset makes the transfer_write out-of-bounds in that dim
+// (offset 1 + size 2 > 2). The dynamic source dim keeps the read conservative.
+//
+// CHECK-LABEL:   func.func @insert_dynamic_slice_non_zero_offset(
+// CHECK-SAME:      %[[ARG_0:.*]]: tensor<1x?x3xf32>,
+// CHECK-SAME:      %[[PAD:.*]]: f32,
+// CHECK-SAME:      %[[SIZE:.*]]: index) -> tensor<9x8x7x1x2x3xf32> {
+// CHECK:           %[[EMPTY:.*]] = tensor.empty() : tensor<9x8x7x1x2x3xf32>
+// CHECK:           %[[BC:.*]] = vector.broadcast %[[PAD]] : f32 to vector<9x8x7x1x2x3xf32>
+// CHECK:           %[[WRITE:.*]] = vector.transfer_write %[[BC]], %[[EMPTY]]{{.*}} {in_bounds = [true, true, true, true, true, true]} : vector<9x8x7x1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
+// CHECK:           %[[READ:.*]] = vector.transfer_read %[[ARG_0]]{{.*}}, %[[PAD]] {in_bounds = [true, false, true]} : tensor<1x?x3xf32>, vector<1x2x3xf32>
+// CHECK:           %[[RES:.*]] = vector.transfer_write %[[READ]], %[[WRITE]]{{.*}} {in_bounds = [true, false, true]} : vector<1x2x3xf32>, tensor<9x8x7x1x2x3xf32>
+// CHECK:           return %[[RES]] : tensor<9x8x7x1x2x3xf32>
+func.func @insert_dynamic_slice_non_zero_offset(%arg0: tensor<1x?x3xf32>, %pad : f32, %size: index) -> tensor<9x8x7x1x2x3xf32> {
+  %init = tensor.empty() : tensor<9x8x7x1x2x3xf32>
+  %fill = linalg.fill ins(%pad : f32) outs(%init : tensor<9x8x7x1x2x3xf32>) -> tensor<9x8x7x1x2x3xf32>
+  %res = tensor.insert_slice %arg0 into %fill[0, 0, 0, 0, 1, 0] [1, 1, 1, 1, %size, 3][1, 1, 1, 1, 1, 1] : tensor<1x?x3xf32> into tensor<9x8x7x1x2x3xf32>
+  return %res : tensor<9x8x7x1x2x3xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.vectorize_children_and_apply_patterns %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
 // Same as above, but the source type has is dynamically shaped. This means
 // that the pad value is now required and the vector dim corresponding to the
 // dynamic shape has to be inferred from the shape of the destination tensor.
diff --git a/mlir/test/Dialect/Vector/transform-vector.mlir b/mlir/test/Dialect/Vector/transform-vector.mlir
index 4dc11c26e83f1..fc788867fbbd0 100644
--- a/mlir/test/Dialect/Vector/transform-vector.mlir
+++ b/mlir/test/Dialect/Vector/transform-vector.mlir
@@ -69,6 +69,73 @@ module attributes {transform.with_named_sequence} {
 
 // -----
 
+// CHECK-LABEL: func @matmul_tensors_iv_indices
+func.func @matmul_tensors_iv_indices(
+  %arg0: tensor<8x16xf32>, %arg1: tensor<16x64xf32>, %arg2: tensor<8x64xf32>)
+    -> tensor<8x64xf32> {
+// CHECK-NOT: linalg
+// CHECK: vector.extract {{.*}} : vector<4xf32> from vector<8x4xf32>
+// CHECK: vector.store {{.*}} : memref<8x64xf32>, vector<4xf32>
+  %0 = linalg.matmul  ins(%arg0, %arg1: tensor<8x16xf32>, tensor<16x64xf32>)
+                     outs(%arg2: tensor<8x64xf32>)
+    -> tensor<8x64xf32>
+  return %0 : tensor<8x64xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.consumed}) {
+    %0 = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!transform.any_op) -> !transform.any_op
+    %1, %loops:3 = transform.structured.tile_using_for %0 tile_sizes [8, 4, 2]
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %2 = transform.get_parent_op %1 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize_children_and_apply_patterns %2 : (!transform.any_op) -> !transform.any_op
+    %b = transform.bufferization.one_shot_bufferize
+        layout{IdentityLayoutMap} %module_op
+        {bufferize_function_boundaries = true, allow_return_allocs = true}
+        : (!transform.any_op) -> !transform.any_op
+
+    %f = transform.structured.match ops{["func.func"]} in %b
+      : (!transform.any_op) -> !transform.any_op
+
+    transform.apply_patterns to %f {
+      transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
+    } : !transform.any_op
+
+    transform.apply_patterns to %f {
+      transform.apply_patterns.vector.transfer_permutation_patterns
+    } : !transform.any_op
+
+    transform.apply_patterns to %f {
+      transform.apply_patterns.vector.reorder_multi_reduction_dims lowering_strategy = "innerparallel"
+      transform.apply_patterns.vector.multi_reduction_flattening lowering_strategy = "innerparallel"
+      transform.apply_patterns.vector.multi_reduction_unrolling lowering_strategy = "innerparallel"
+    } : !transform.any_op
+
+    transform.apply_patterns to %f {
+      transform.apply_patterns.vector.split_transfer_full_partial split_transfer_strategy = "linalg-copy"
+    } : !transform.any_op
+
+    transform.apply_patterns to %f {
+      transform.apply_patterns.vector.transfer_to_scf max_transfer_rank = 1 full_unroll = true
+    } : !transform.any_op
+
+    transform.apply_patterns to %f {
+      transform.apply_patterns.vector.lower_transfer max_transfer_rank = 1
+    } : !transform.any_op
+
+    transform.apply_patterns to %f {
+      transform.apply_patterns.vector.lower_shape_cast
+    } : !transform.any_op
+
+    transform.apply_patterns to %f {
+      transform.apply_patterns.vector.lower_transpose lowering_strategy = "shuffle_1d"
+    } : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
 // CHECK-DAG: #[[$map0:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)>
 // CHECK-DAG: #[[$map1:.*]] = affine_map<(d0, d1, d2) -> (d2, d1)>
 // CHECK-DAG: #[[$map2:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)>