[Mlir-commits] [mlir] 7a078b6 - [mlir][linalg] Refine how contiguous loads are identified

Andrzej Warzynski llvmlistbot at llvm.org
Wed Mar 8 00:23:59 PST 2023


Author: Andrzej Warzynski
Date: 2023-03-08T07:49:04Z
New Revision: 7a078b65fb880eb97bb47a26e938cdbd23531688

URL: https://github.com/llvm/llvm-project/commit/7a078b65fb880eb97bb47a26e938cdbd23531688
DIFF: https://github.com/llvm/llvm-project/commit/7a078b65fb880eb97bb47a26e938cdbd23531688.diff

LOG: [mlir][linalg] Refine how contiguous loads are identified

Vectorization of `tensor.extract` using contiguous loads
(`vector.transfer_read`) was introduced in [1]. This patch updates and
refines the existing logic so that more contiguous cases can be
identified, and adds more tests.

Specifically, contiguous load operations are identified by making sure
that:
  1. the non-trailing indices for `tensor.extract` are loop invariant (so
     that, e.g., there are no "jumps" from one row to another between
     iterations), and
  2. the trailing index for `tensor.extract` increments by 1 with every
     loop iteration (so that adjacent elements are always loaded).
This patch introduces:
  * `isLoopInvariantIdx` for condition 1., and
  * `isContiguousLoadIdx` for condition 2.
Together, these new methods replace the previous `isContiguousLoadIdx`
and `isBasedOnIndexOp` helpers.

Both approaches lead to similar end results (none of the existing tests
required updating). However, with the updated approach it is much easier
to treat the trailing and non-trailing indices separately and to add
more cases for which contiguous loads can be used.
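
For illustration, below is a minimal sketch of a kernel that satisfies
both conditions and is therefore vectorised with a contiguous load
(`vector.transfer_read`). It is adapted from one of the new tests added
in this patch (`vectorize_nd_tensor_extract_with_affine_apply_contiguous`);
the function name, value names and shapes are only illustrative.

  func.func @contiguous_load_sketch(%src: tensor<80x16xf32>, %off: index,
                                    %init: tensor<1x4xf32>) -> tensor<1x4xf32> {
    %c79 = arith.constant 79 : index
    %res = linalg.generic {
      indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
      iterator_types = ["parallel", "parallel"]
    } outs(%init : tensor<1x4xf32>) {
    ^bb0(%out: f32):
      // Trailing loop index: increments by 1 with every iteration
      // (condition 2.).
      %i = linalg.index 1 : index
      // Adding a loop-invariant value keeps the trailing index stride-1.
      %col = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%i, %off)
      // The non-trailing index %c79 is loop invariant (condition 1.), so
      // this extract is vectorised as a vector.transfer_read rather than
      // a vector.gather.
      %v = tensor.extract %src[%c79, %col] : tensor<80x16xf32>
      linalg.yield %v : f32
    } -> tensor<1x4xf32>
    return %res : tensor<1x4xf32>
  }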

[1] https://reviews.llvm.org/D141998

Differential Revision: https://reviews.llvm.org/D145385

Added: 
    

Modified: 
    mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
    mlir/test/Dialect/Linalg/vectorization.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index 29cabd42d3da9..3db61da84128a 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -636,81 +636,112 @@ enum VectorMemoryAccessKind {
   Gather
 };
 
-/// Check whether /p val can be used for calculating an index for a contiguous
-/// load operation. This means that /p val should either:
-///   * be invariant with respect to /p linalgOp, or
-///   * increment by 1 with every loop iterator (when /p shouldBeConstant is
-///     false).
-/// Parameters /p trailingLoopDim and /p shouldBeConstant are used to analyze
-/// `linalg.index` ops.
-static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val,
-                                size_t trailingLoopDim, bool shouldBeConstant) {
-  auto *block = linalgOp.getBlock();
+/// Checks whether \p val can be used for calculating a loop invariant index.
+static bool isLoopInvariantIdx(LinalgOp &linalgOp, Value &val) {
 
-  // Bail out if this is a block argument for this linalg.generic Op.
+  auto targetShape = linalgOp.getStaticLoopRanges();
+  assert(((llvm::count_if(targetShape,
+                          [](int64_t dimSize) { return dimSize > 1; }) == 1)) &&
+         "n-D vectors are not yet supported");
+  assert(targetShape.back() != 1 &&
+         "1-D vectors with the trailing dim eqaual 1 are not yet supported");
+
+  // Block arguments of blocks outside _this_ linalg.generic are effectively
+  // loop invariant. However, analysing block arguments for _this_
+  // linalg.generic Op is a bit tricky. Just bail out in the latter case.
   // TODO: We could try analysing the corresponding affine map here.
-  if (val.dyn_cast<BlockArgument>())
+  auto *block = linalgOp.getBlock();
+  if (isa<BlockArgument>(val))
     return llvm::all_of(block->getArguments(),
                         [&val](Value v) { return (v != val); });
 
   Operation *defOp = val.getDefiningOp();
   assert(defOp && "This is neither a block argument nor an operation result");
 
-  // We know that we are reading into a 1-D tensor like this:
-  // `tensor<1x1x4xi32`. Given this assumption, the following Op:
-  //    * `%idx = `linalg.index dim : index`,
-  // will either:
-  //    1. produce a constant when `dim` _is not_ the trailing loop dim, or
-  //    2. increment with stride one when `dim` _is_ the trailing loop dim.
+  // IndexOp is loop invariant as long as its result remains constant across
+  // iterations. Given the assumptions on the loop ranges above, only the
+  // trailing loop dim ever changes.
+  auto trailingLoopDim = linalgOp.getStaticLoopRanges().size() - 1;
   if (auto indexOp = dyn_cast<linalg::IndexOp>(defOp))
-    return shouldBeConstant ? (indexOp.getDim() != trailingLoopDim)
-                            : (indexOp.getDim() == trailingLoopDim);
+    return (indexOp.getDim() != trailingLoopDim);
 
   auto *ancestor = block->findAncestorOpInBlock(*defOp);
 
-  // Values define outside `linalgOp`.
+  // Values defined outside `linalgOp` are loop invariant.
   if (!ancestor)
     return true;
 
-  // Values defined inside `linalgOp`, which are constant.
-  if (dyn_cast<arith::ConstantOp>(ancestor))
+  // Values defined inside `linalgOp`, which are constant, are loop invariant.
+  if (isa<arith::ConstantOp>(ancestor))
     return true;
 
-  // Conservatively reject Ops that could lead to non-contiguous accesses.
-  if (!isa<arith::AddIOp, arith::SubIOp, linalg::IndexOp>(ancestor))
-    return false;
-
   bool result = true;
   for (auto op : ancestor->getOperands())
-    result &=
-        isContiguousLoadIdx(linalgOp, op, trailingLoopDim, shouldBeConstant);
+    result &= isLoopInvariantIdx(linalgOp, op);
 
   return result;
 }
 
-/// Check whether the calculation of \p val is based on linalg.index Op with
-/// the dim attribute matching \p dim.
-static bool isBasedOnIndexOp(LinalgOp &linalgOp, Value &val, size_t dim) {
-  auto *block = linalgOp.getBlock();
-  auto targetShape = linalgOp.getStaticLoopRanges();
+/// Check whether \p val could be used for calculating the trailing index for a
+/// contiguous load operation.
+///
+/// There are currently 3 types of values that are allowed here:
+///   1. loop-invariant values,
+///   2. values that increment by 1 with every loop iteration,
+///   3. results of basic arithmetic operations (linear and continuous)
+///      involving 1., 2. and 3.
+/// This method returns True if indeed only such values are used in calculating
+/// \p val.
+///
+/// Additionally, the trailing index for a contiguous load operation should
+/// increment by 1 with every loop iteration, i.e. be based on:
+///   * `linalg.index <dim>` ,
+/// where <dim> is the trailing dim of the iteration space. \p foundIndexOp is
+/// updated to `true` when such an op is found.
+static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val,
+                                bool &foundIndexOp) {
 
-  if (val.isa<BlockArgument>())
-    return false;
+  auto targetShape = linalgOp.getStaticLoopRanges();
+  assert(((llvm::count_if(targetShape,
+                          [](int64_t dimSize) { return dimSize > 1; }) == 1)) &&
+         "n-D vectors are not yet supported");
+  assert(targetShape.back() != 1 &&
+         "1-D vectors with the trailing dim 1 are not yet supported");
+
+  // Block arguments of blocks outside _this_ linalg.generic are effectively
+  // loop invariant. However, analysing block arguments for _this_
+  // linalg.generic Op is a bit tricky. Just bail out in the latter case.
+  // TODO: We could try analysing the corresponding affine map here.
+  auto *block = linalgOp.getBlock();
+  if (isa<BlockArgument>(val))
+    return llvm::all_of(block->getArguments(),
+                        [&val](Value v) { return (v != val); });
 
   Operation *defOp = val.getDefiningOp();
   assert(defOp && "This is neither a block argument nor an operation result");
 
-  if (auto indexOp = dyn_cast<linalg::IndexOp>(defOp))
-    return (indexOp.getDim() == dim);
+  // Given the assumption on the loop ranges above, only the trailing loop
+  // index is not constant.
+  auto trailingLoopDim = linalgOp.getStaticLoopRanges().size() - 1;
+  if (auto indexOp = dyn_cast<linalg::IndexOp>(defOp)) {
+    foundIndexOp = (indexOp.getDim() == trailingLoopDim);
+    return true;
+  }
 
   auto *ancestor = block->findAncestorOpInBlock(*defOp);
 
   if (!ancestor)
     return false;
 
+  // Conservatively reject Ops that could lead to indices with stride other
+  // than 1.
+  if (!isa<arith::AddIOp, arith::SubIOp, arith::ConstantOp, linalg::IndexOp>(
+          ancestor))
+    return false;
+
   bool result = false;
   for (auto op : ancestor->getOperands())
-    result |= isBasedOnIndexOp(linalgOp, op, dim);
+    result |= isContiguousLoadIdx(linalgOp, op, foundIndexOp);
 
   return result;
 }
@@ -725,7 +756,7 @@ getTensorExtractMemoryAccessPattern(tensor::ExtractOp extractOp,
 
   auto targetShape = linalgOp.getStaticLoopRanges();
 
-  // Assume that it's a gather load when reading _into_:
+  // 1. Assume that it's a gather load when reading _into_:
   //    * an n-D vector, like`tensor<1x2x4xi32` or`tensor<2x1x4xi32>`, or
   //    * a 1-D vector with the trailing dim equal 1, e.g. `tensor<1x4x1xi32`.
   // TODO: Relax these conditions.
@@ -736,44 +767,36 @@ getTensorExtractMemoryAccessPattern(tensor::ExtractOp extractOp,
 
   auto inputShape = extractOp.getTensor().getType().cast<ShapedType>();
 
-  // Assume that it's a gather load when reading _from_ a tensor for which the
-  // trailing dimension is 1, e.g. `tensor<1x4x1xi32>`.
+  // 2. Assume that it's a gather load when reading _from_ a tensor for which
+  // the trailing dimension is 1, e.g. `tensor<1x4x1xi32>`.
   // TODO: Relax this condition.
   if (inputShape.getShape().back() == 1)
     return VectorMemoryAccessKind::Gather;
 
-  // The trailing loop dim is needed when analyzing ops like:
-  //     * %idx = `linalg.index <dim> : index`.
-  auto trailingLoopDim = targetShape.size() - 1;
-
   bool isContiguous = true;
 
-  // Iterate over all indices. Analyze the way each index is calculated and
-  // decide whether it is suitable for a contiguous load (e.g. loop invariant).
+  // 3a. Analyze the leading indices of `extractOp`.
+  // Look at the way each index is calculated and decide whether it is suitable
+  // for a contiguous load, i.e. whether it's loop invariant.
   auto indices = extractOp.getIndices();
-  for (auto [i, indexVal] : llvm::enumerate(indices)) {
-    if (inputShape.getShape()[i] == 1) {
-      // This index will always be equal 0, so it is a loop-invariant constant.
-      continue;
-    }
+  auto leadIndices = ValueRange(indices.drop_back(1));
 
-    // Should this index be loop invariant?
-    //  * _no_ if this is the trailing index,
-    //  * _yes_ otherwise.
-    auto extractOpBottomIdx = indices.size() - 1;
-    bool loopInvariantIndex = (i != extractOpBottomIdx);
+  for (auto [i, indexVal] : llvm::enumerate(leadIndices)) {
+    if (inputShape.getShape()[i] == 1)
+      continue;
 
-    isContiguous &= isContiguousLoadIdx(linalgOp, indexVal, trailingLoopDim,
-                                        loopInvariantIndex);
+    isContiguous &= isLoopInvariantIdx(linalgOp, indexVal);
   }
 
-  // The trailing index in the extract Op must increment with every iteration,
-  // which means that it must be based on a loop index. Given the assumption
-  // on the output tensor, only the trailing loop index is not constant, so
-  // that's what we need to check against.
+  // 3b. Analyze the trailing index for `extractOp`.
   auto extractOpTrailingIdx = indices.back();
+  // For contiguous loads, the trailing `extractOp` index should increment with
+  // every loop iteration. This effectively means that it must be based on the
+  // trailing loop index. This is what the following bool captures.
+  bool foundIndexOp = false;
   isContiguous &=
-      isBasedOnIndexOp(linalgOp, extractOpTrailingIdx, trailingLoopDim);
+      isContiguousLoadIdx(linalgOp, extractOpTrailingIdx, foundIndexOp);
+  isContiguous &= foundIndexOp;
 
   if (isContiguous) {
     LDBG("Found contigous load: " << extractOp);

diff  --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir
index 41e52909fffd9..26e27c108ce81 100644
--- a/mlir/test/Dialect/Linalg/vectorization.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization.mlir
@@ -1783,6 +1783,211 @@ func.func @vectorize_nd_tensor_extract_contiguous_and_gather(%arg0: tensor<6xf32
 // CHECK:           %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_8]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
 // CHECK:           return %[[VAL_14]] : tensor<5xf32>
 
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Contiguous load.
+func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c79 = arith.constant 79 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
+    %extracted = tensor.extract %6[%c79, %3] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_with_affine_apply_contiguous(
+// CHECK-SAME:                                                                        %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME:                                                                        %[[VAL_1:.*]]: index,
+// CHECK-SAME:                                                                        %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK:           %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
+// CHECK:           %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_7:.*]] = arith.constant 79 : index
+// CHECK:           %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
+// CHECK:           %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
+// CHECK:           %[[VAL_10:.*]] = vector.extractelement %[[VAL_9]]{{\[}}%[[VAL_4]] : i32] : vector<4xindex>
+// CHECK:           %[[VAL_11:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_10]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
+// CHECK:           %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK:           return %[[VAL_12]] : tensor<1x4xf32>
+// CHECK:         }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// The vectorizer converts `affine.apply` so that the subsequent Ops can be vectorised based on the converted ops. Gather load.
+func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(%6: tensor<80x16xf32>, %arg0: index, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c16 = arith.constant 16 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)>(%2, %arg0)
+    %extracted = tensor.extract %6[%3, %c16] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_with_affine_apply_gather(
+// CHECK-SAME:                                                                    %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME:                                                                    %[[VAL_1:.*]]: index,
+// CHECK-SAME:                                                                    %[[VAL_2:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK:           %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK:           %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
+// CHECK:           %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
+// CHECK:           %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_7:.*]] = arith.constant dense<16> : vector<1x4xindex>
+// CHECK:           %[[VAL_8:.*]] = vector.broadcast %[[VAL_1]] : index to vector<4xindex>
+// CHECK:           %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_3]] : vector<4xindex>
+// CHECK:           %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : vector<4xindex> to vector<1x4xindex>
+// CHECK:           %[[VAL_11:.*]] = arith.muli %[[VAL_10]], %[[VAL_7]] : vector<1x4xindex>
+// CHECK:           %[[VAL_12:.*]] = arith.addi %[[VAL_11]], %[[VAL_7]] : vector<1x4xindex>
+// CHECK:           %[[VAL_13:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {{\[}}%[[VAL_12]]], %[[VAL_4]], %[[VAL_5]] : tensor<80x16xf32>, vector<1x4xindex>, vector<1x4xi1>, vector<1x4xf32> into vector<1x4xf32>
+// CHECK:           %[[VAL_14:.*]] = vector.transfer_write %[[VAL_13]], %[[VAL_2]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK:           return %[[VAL_14]] : tensor<1x4xf32>
+// CHECK:         }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Gather load.
+func.func @vectorize_nd_tensor_extract_with_maxsi_gather(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c79 = arith.constant 79 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 1 : index
+    %3 = arith.maxsi %2, %c79 : index
+    %extracted = tensor.extract %arg0[%3, %2] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_with_maxsi_gather(
+// CHECK-SAME:                                                             %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME:                                                             %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK:           %[[VAL_2:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex>
+// CHECK:           %[[VAL_3:.*]] = arith.constant dense<1264> : vector<1x4xindex>
+// CHECK:           %[[VAL_4:.*]] = arith.constant dense<true> : vector<1x4xi1>
+// CHECK:           %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<1x4xf32>
+// CHECK:           %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_7:.*]] = vector.broadcast %[[VAL_2]] : vector<4xindex> to vector<1x4xindex>
+// CHECK:           %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_3]] : vector<1x4xindex>
+// CHECK:           %[[VAL_9:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {{\[}}%[[VAL_8]]], %[[VAL_4]], %[[VAL_5]] : tensor<80x16xf32>, vector<1x4xindex>, vector<1x4xi1>, vector<1x4xf32> into vector<1x4xf32>
+// CHECK:           %[[VAL_10:.*]] = vector.transfer_write %[[VAL_9]], %[[VAL_1]]{{\[}}%[[VAL_6]], %[[VAL_6]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK:           return %[[VAL_10]] : tensor<1x4xf32>
+// CHECK:         }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// Make sure that non-linear arithmetic operations (e.g. arith.maxsi) are allowed when calculating indices for load operations. Contiguous load.
+func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(%arg0: tensor<80x16xf32>, %extracted_slice : tensor<1x4xf32>) -> tensor<1x4xf32> {
+  %c16 = arith.constant 16 : index
+  %1 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } outs(%extracted_slice : tensor<1x4xf32>) {
+  ^bb0(%out: f32):
+    %2 = linalg.index 0 : index
+    %3 = linalg.index 1 : index
+    %4 = arith.maxsi %2, %c16 : index
+    %extracted = tensor.extract %arg0[%4, %3] : tensor<80x16xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x4xf32>
+  return %1 : tensor<1x4xf32>
+}
+
+// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_with_maxsi_contiguous(
+// CHECK-SAME:                                                                 %[[VAL_0:.*]]: tensor<80x16xf32>,
+// CHECK-SAME:                                                                 %[[VAL_1:.*]]: tensor<1x4xf32>) -> tensor<1x4xf32> {
+// CHECK:           %[[VAL_2:.*]] = arith.constant dense<16> : vector<1x4xindex>
+// CHECK:           %[[VAL_3:.*]] = arith.constant 0 : i32
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK:           %[[VAL_6:.*]] = vector.shape_cast %[[VAL_2]] : vector<1x4xindex> to vector<4xindex>
+// CHECK:           %[[VAL_7:.*]] = vector.extractelement %[[VAL_6]]{{\[}}%[[VAL_3]] : i32] : vector<4xindex>
+// CHECK:           %[[VAL_8:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_7]], %[[VAL_4]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<80x16xf32>, vector<1x4xf32>
+// CHECK:           %[[VAL_9:.*]] = vector.transfer_write %[[VAL_8]], %[[VAL_1]]{{\[}}%[[VAL_4]], %[[VAL_4]]] {in_bounds = [true, true]} : vector<1x4xf32>, tensor<1x4xf32>
+// CHECK:           return %[[VAL_9]] : tensor<1x4xf32>
+// CHECK:         }
+
+transform.sequence failures(propagate) {
+ ^bb1(%arg1: !pdl.operation):
+   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+   %1 = get_closest_isolated_parent %0 : (!pdl.operation) -> !pdl.operation
+   %2 = transform.structured.vectorize %1 { vectorize_nd_extract }
+ }
+
+// -----
+
+// The vectorizer assumes it's a gather load whenever using a block argument to calculate an index.
+#map = affine_map<(d0) -> (d0)>
+func.func @vectorize_nd_tensor_extract_block_arg(%arg0: tensor<5x6xf32>, %arg1: tensor<5xindex>) -> tensor<5xf32> {
+ %0 = tensor.empty() : tensor<5xf32>
+ %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg1: tensor<5xindex>) outs(%0 : tensor<5xf32>) {
+ ^bb0(%in: index, %out: f32):
+   %2 = linalg.index 0 : index
+   %extracted_0 = tensor.extract %arg0[%in, %2] : tensor<5x6xf32>
+   linalg.yield %extracted_0 : f32
+ } -> tensor<5xf32>
+ return %1 : tensor<5xf32>
+}
+
+// CHECK-LABEL:   func.func @vectorize_nd_tensor_extract_block_arg(
+// CHECK-SAME:                                                     %[[VAL_0:.*]]: tensor<5x6xf32>,
+// CHECK-SAME:                                                     %[[VAL_1:.*]]: tensor<5xindex>) -> tensor<5xf32> {
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant dense<[0, 1, 2, 3, 4]> : vector<5xindex>
+// CHECK:           %[[VAL_4:.*]] = arith.constant dense<true> : vector<5xi1>
+// CHECK:           %[[VAL_5:.*]] = arith.constant dense<0.000000e+00> : vector<5xf32>
+// CHECK:           %[[VAL_6:.*]] = arith.constant dense<6> : vector<5xindex>
+// CHECK:           %[[VAL_7:.*]] = tensor.empty() : tensor<5xf32>
+// CHECK:           %[[VAL_8:.*]] = vector.transfer_read %[[VAL_1]]{{\[}}%[[VAL_2]]], %[[VAL_2]] {in_bounds = [true]} : tensor<5xindex>, vector<5xindex>
+// CHECK:           %[[VAL_9:.*]] = arith.muli %[[VAL_8]], %[[VAL_6]] : vector<5xindex>
+// CHECK:           %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_3]] : vector<5xindex>
+// CHECK:           %[[VAL_11:.*]] = vector.gather %[[VAL_0]]{{\[}}%[[VAL_2]], %[[VAL_2]]] {{\[}}%[[VAL_10]]], %[[VAL_4]], %[[VAL_5]] : tensor<5x6xf32>, vector<5xindex>, vector<5xi1>, vector<5xf32> into vector<5xf32>
+// CHECK:           %[[VAL_12:.*]] = vector.transfer_write %[[VAL_11]], %[[VAL_7]]{{\[}}%[[VAL_2]]] {in_bounds = [true]} : vector<5xf32>, tensor<5xf32>
+// CHECK:           return %[[VAL_12]] : tensor<5xf32>
+// CHECK:         }
+
 transform.sequence failures(propagate) {
  ^bb1(%arg1: !pdl.operation):
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation


        


More information about the Mlir-commits mailing list