[Mlir-commits] [llvm] [mlir] [mlir][vector] Add mask elimination transform (PR #99314)

Tue Jul 23 06:53:51 PDT 2024

https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/99314

>From 09b0fcd7275fccb385cf273ff8f8ca0850551b2e Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 24 Jun 2024 14:42:18 +0000
Subject: [PATCH 1/8] [mlir][vector] Add mask elimination transform

This adds a new transform `eliminateVectorMasks()` which aims at
removing scalable `vector.create_masks` that will be all-true at
runtime. It attempts to do this by simply pattern-matching the mask
operands (similar to some canonicalizations), if that does not lead to
an answer (is all-true? yes/no), then value bounds analysis will be used
to find the lower bound of the unknown operands. If the lower bound is
>= to the corresponding mask vector type dim, then that dimension of the
mask is all true.

Note: Eliminating create_masks here means replacing them with all-true
constants (which will then lead to the masks folding away).
---
 .../Vector/Transforms/VectorTransforms.h      |  17 +++
 .../Dialect/Vector/Transforms/CMakeLists.txt  |   1 +
 .../Transforms/VectorMaskElimination.cpp      | 117 +++++++++++++++
 mlir/test/Dialect/Vector/eliminate-masks.mlir | 138 ++++++++++++++++++
 .../Dialect/Vector/TestVectorTransforms.cpp   |  34 +++++
 5 files changed, 307 insertions(+)
 create mode 100644 mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
 create mode 100644 mlir/test/Dialect/Vector/eliminate-masks.mlir

diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h
index 1f7d6411cd5a4..847f333d6a931 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h
@@ -11,6 +11,7 @@
 
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
 #include "mlir/Dialect/Vector/Utils/VectorUtils.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
 
 namespace mlir {
 class MLIRContext;
@@ -115,6 +116,22 @@ castAwayContractionLeadingOneDim(vector::ContractionOp contractOp,
                                  MaskingOpInterface maskingOp,
                                  RewriterBase &rewriter);
 
+/// Structure to hold the range [vscaleMin, vscaleMax] `vector.vscale` can take.
+struct VscaleRange {
+  unsigned vscaleMin;
+  unsigned vscaleMax;
+};
+
+/// Attempts to eliminate redundant vector masks by replacing them with all-true
+/// constants at the top of the function (which results in the masks folding
+/// away). Note: Currently, this only runs for vector.create_mask ops and
+/// requires `vscaleRange`. If `vscaleRange` is not provided this transform does
+/// nothing. This is because these redundant masks are much more likely for
+/// scalable code which requires memref/tensor dynamic sizes, whereas fixed-size
+/// code has static sizes, so simpler folds remove the masks.
+void eliminateVectorMasks(IRRewriter &rewriter, FunctionOpInterface function,
+                          std::optional<VscaleRange> vscaleRange = {});
+
 } // namespace vector
 } // namespace mlir
 
diff --git a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
index 723b2f62d65d4..2639a67e1c8b3 100644
--- a/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
@@ -22,6 +22,7 @@ add_mlir_dialect_library(MLIRVectorTransforms
   VectorTransferSplitRewritePatterns.cpp
   VectorTransforms.cpp
   VectorUnroll.cpp
+  VectorMaskElimination.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Vector/Transforms
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
new file mode 100644
index 0000000000000..abec8c75b8fc9
--- /dev/null
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
@@ -0,0 +1,117 @@
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h"
+#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
+#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+
+using namespace mlir;
+using namespace mlir::vector;
+namespace {
+
+/// If `value` is a constant multiple of `vector.vscale` return the multiplier.
+std::optional<int64_t> getConstantVscaleMultiplier(Value value) {
+  if (value.getDefiningOp<vector::VectorScaleOp>())
+    return 1;
+  auto mul = value.getDefiningOp<arith::MulIOp>();
+  if (!mul)
+    return {};
+  auto lhs = mul.getLhs();
+  auto rhs = mul.getRhs();
+  if (lhs.getDefiningOp<vector::VectorScaleOp>())
+    return getConstantIntValue(rhs);
+  if (rhs.getDefiningOp<vector::VectorScaleOp>())
+    return getConstantIntValue(lhs);
+  return {};
+}
+
+/// Attempts to resolve a (scalable) CreateMaskOp to an all-true constant mask.
+/// All-true masks can then be eliminated by simple folds.
+LogicalResult resolveAllTrueCreateMaskOp(IRRewriter &rewriter,
+                                         vector::CreateMaskOp createMaskOp,
+                                         VscaleRange vscaleRange) {
+  auto maskType = createMaskOp.getVectorType();
+  auto maskTypeDimScalableFlags = maskType.getScalableDims();
+  auto maskTypeDimSizes = maskType.getShape();
+
+  struct UnknownMaskDim {
+    size_t position;
+    Value dimSize;
+  };
+
+  // Check for any dims that could be (partially) false before doing the more
+  // expensive value bounds computations.
+  SmallVector<UnknownMaskDim> unknownDims;
+  for (auto [i, dimSize] : llvm::enumerate(createMaskOp.getOperands())) {
+    if (auto intSize = getConstantIntValue(dimSize)) {
+      // Mask not all-true for this dim.
+      if (maskTypeDimScalableFlags[i] || intSize < maskTypeDimSizes[i])
+        return failure();
+    } else if (auto vscaleMultiplier = getConstantVscaleMultiplier(dimSize)) {
+      // Mask not all-true for this dim.
+      if (vscaleMultiplier < maskTypeDimSizes[i])
+        return failure();
+    } else {
+      // Unknown (without further analysis).
+      unknownDims.push_back(UnknownMaskDim{i, dimSize});
+    }
+  }
+
+  for (auto [i, dimSize] : unknownDims) {
+    // Compute the lower bound for the unknown dimension (i.e. the smallest
+    // value it could be).
+    auto lowerBound =
+        vector::ScalableValueBoundsConstraintSet::computeScalableBound(
+            dimSize, {}, vscaleRange.vscaleMin, vscaleRange.vscaleMax,
+            presburger::BoundType::LB);
+    if (failed(lowerBound))
+      return failure();
+    auto boundSize = lowerBound->getSize();
+    if (failed(boundSize))
+      return failure();
+    if (boundSize->scalable) {
+      // If the lower bound is scalable and >= to the mask dim size then this
+      // dim is all-true.
+      if (boundSize->baseSize < maskTypeDimSizes[i])
+        return failure();
+    } else {
+      // If the lower bound is a constant and >= to the _fixed-size_ mask dim
+      // size then this dim is all-true.
+      if (maskTypeDimScalableFlags[i])
+        return failure();
+      if (boundSize->baseSize < maskTypeDimSizes[i])
+        return failure();
+    }
+  }
+
+  // Replace createMaskOp with an all-true constant. This should result in the
+  // mask being removed in most cases (as xfer ops + vector.mask have folds to
+  // remove all-true masks).
+  auto allTrue = rewriter.create<arith::ConstantOp>(
+      createMaskOp.getLoc(), maskType, DenseElementsAttr::get(maskType, true));
+  rewriter.replaceAllUsesWith(createMaskOp, allTrue);
+  return success();
+}
+
+} // namespace
+
+namespace mlir::vector {
+
+void eliminateVectorMasks(IRRewriter &rewriter, FunctionOpInterface function,
+                          std::optional<VscaleRange> vscaleRange) {
+  // TODO: Support fixed-size case. This is less likely to be useful as for
+  // fixed-size code dimensions are all static so masks tend to fold away.
+  if (!vscaleRange)
+    return;
+
+  OpBuilder::InsertionGuard g(rewriter);
+  SmallVector<vector::CreateMaskOp> worklist;
+  function.walk([&](vector::CreateMaskOp createMaskOp) {
+    worklist.push_back(createMaskOp);
+  });
+  rewriter.setInsertionPointToStart(&function.front());
+  for (auto mask : worklist)
+    (void)resolveAllTrueCreateMaskOp(rewriter, mask, *vscaleRange);
+}
+
+} // namespace mlir::vector
diff --git a/mlir/test/Dialect/Vector/eliminate-masks.mlir b/mlir/test/Dialect/Vector/eliminate-masks.mlir
new file mode 100644
index 0000000000000..99c9a60a09fac
--- /dev/null
+++ b/mlir/test/Dialect/Vector/eliminate-masks.mlir
@@ -0,0 +1,138 @@
+// RUN: mlir-opt %s -split-input-file -test-eliminate-vector-masks  | FileCheck %s
+
+// This tests a general pattern the vectorizer tends to emit.
+
+// CHECK-LABEL: @eliminate_redundant_masks_through_insert_and_extracts
+// CHECK: %[[ALL_TRUE_MASK:.*]] = arith.constant dense<true> : vector<[4]xi1>
+// CHECK: vector.transfer_read {{.*}} %[[ALL_TRUE_MASK]]
+// CHECK: vector.transfer_write {{.*}} %[[ALL_TRUE_MASK]]
+func.func @eliminate_redundant_masks_through_insert_and_extracts(%tensor: tensor<1x1000xf32>) {
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %c1000 = arith.constant 1000 : index
+  %c0_f32 = arith.constant 0.0 : f32
+  %vscale = vector.vscale
+  %c4_vscale = arith.muli %vscale, %c4 : index
+  %extracted_slice_0 = tensor.extract_slice %tensor[0, 0] [1, %c4_vscale] [1, 1] : tensor<1x1000xf32> to tensor<1x?xf32>
+  %output_tensor = scf.for %i = %c0 to %c1000 step %c4_vscale iter_args(%arg = %extracted_slice_0) -> tensor<1x?xf32> {
+    // 1. Extract a slice.
+    %extracted_slice_1 = tensor.extract_slice %arg[0, 0] [1, %c4_vscale] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
+
+    // 2. Create a mask for the slice.
+    %dim_1 = tensor.dim %extracted_slice_1, %c0 : tensor<?xf32>
+    %mask = vector.create_mask %dim_1 : vector<[4]xi1>
+
+    // 3. Read the slice and do some computation.
+    %vec = vector.transfer_read %extracted_slice_1[%c0], %c0_f32, %mask {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32>
+    %new_vec = "test.some_computation"(%vec) : (vector<[4]xf32>) -> (vector<[4]xf32>)
+
+    // 4. Write the new value.
+    %write = vector.transfer_write %new_vec, %extracted_slice_1[%c0], %mask {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32>
+
+    // 5. Insert and yield the new tensor value.
+    %result = tensor.insert_slice %write into %arg[0, 0] [1, %c4_vscale] [1, 1] : tensor<?xf32> into tensor<1x?xf32>
+    scf.yield %result : tensor<1x?xf32>
+  }
+  "test.some_use"(%output_tensor) : (tensor<1x?xf32>) -> ()
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @negative_extract_slice_size_shrink
+// CHECK-NOT: arith.constant dense<true> : vector<[4]xi1>
+// CHECK: %[[MASK:.*]] = vector.create_mask
+// CHECK: "test.some_use"(%[[MASK]]) : (vector<[4]xi1>) -> ()
+func.func @negative_extract_slice_size_shrink(%tensor: tensor<1000xf32>) {
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %c1000 = arith.constant 1000 : index
+  %vscale = vector.vscale
+  %c4_vscale = arith.muli %vscale, %c4 : index
+  %extracted_slice = tensor.extract_slice %tensor[0] [%c4_vscale] [1] : tensor<1000xf32> to tensor<?xf32>
+  %slice = scf.for %i = %c0 to %c1000 step %c4_vscale iter_args(%arg = %extracted_slice) -> tensor<?xf32> {
+    // This mask cannot be eliminated even though looking at the above operations
+    // it appears `tensor.dim` will always be c4_vscale (so the mask all-true).
+    %dim = tensor.dim %arg, %c0 : tensor<?xf32>
+    %mask = vector.create_mask %dim : vector<[4]xi1>
+    "test.some_use"(%mask) : (vector<[4]xi1>) -> ()
+    // !!! Here the size of the mask could shrink in the next iteration.
+    %next_num_els = affine.min  affine_map<(d0)[s0] -> (-d0 + 1000, s0)>(%i)[%c4_vscale]
+    %new_extracted_slice = tensor.extract_slice %tensor[%c4_vscale] [%next_num_els] [1] : tensor<1000xf32> to tensor<?xf32>
+    scf.yield %new_extracted_slice : tensor<?xf32>
+  }
+  "test.some_use"(%slice) : (tensor<?xf32>) -> ()
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @negative_constant_dim_not_all_true
+// CHECK-NOT: arith.constant dense<true> : vector<2x[4]xi1>
+// CHECK: %[[MASK:.*]] = vector.create_mask
+// CHECK: "test.some_use"(%[[MASK]]) : (vector<2x[4]xi1>) -> ()
+func.func @negative_constant_dim_not_all_true()
+{
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %vscale = vector.vscale
+  %c4_vscale = arith.muli %vscale, %c4 : index
+  %mask = vector.create_mask %c1, %c4_vscale : vector<2x[4]xi1>
+  "test.some_use"(%mask) : (vector<2x[4]xi1>) -> ()
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @negative_constant_vscale_multiple_not_all_true
+// CHECK-NOT: arith.constant dense<true> : vector<2x[4]xi1>
+// CHECK: %[[MASK:.*]] = vector.create_mask
+// CHECK: "test.some_use"(%[[MASK]]) : (vector<2x[4]xi1>) -> ()
+func.func @negative_constant_vscale_multiple_not_all_true() {
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %vscale = vector.vscale
+  %c3_vscale = arith.muli %vscale, %c3 : index
+  %mask = vector.create_mask %c2, %c3_vscale : vector<2x[4]xi1>
+  "test.some_use"(%mask) : (vector<2x[4]xi1>) -> ()
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @negative_value_bounds_fixed_dim_not_all_true
+// CHECK-NOT: arith.constant dense<true> : vector<3x[4]xi1>
+// CHECK: %[[MASK:.*]] = vector.create_mask
+// CHECK: "test.some_use"(%[[MASK]]) : (vector<3x[4]xi1>) -> ()
+func.func @negative_value_bounds_fixed_dim_not_all_true(%tensor: tensor<2x?xf32>)
+{
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %vscale = vector.vscale
+  %c4_vscale = arith.muli %vscale, %c4 : index
+  // This is _very_ simple but since addi is not a constant value bounds will
+  // be used to resolve it.
+  %dim = tensor.dim %tensor, %c0 : tensor<2x?xf32>
+  %mask = vector.create_mask %dim, %c4_vscale : vector<3x[4]xi1>
+  "test.some_use"(%mask) : (vector<3x[4]xi1>) -> ()
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @negative_value_bounds_scalable_dim_not_all_true
+// CHECK-NOT: arith.constant dense<true> : vector<3x[4]xi1>
+// CHECK: %[[MASK:.*]] = vector.create_mask
+// CHECK: "test.some_use"(%[[MASK]]) : (vector<3x[4]xi1>) -> ()
+func.func @negative_value_bounds_scalable_dim_not_all_true(%tensor: tensor<2x100xf32>) {
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %vscale = vector.vscale
+  %c3_vscale = arith.muli %vscale, %c3 : index
+  %slice = tensor.extract_slice %tensor[0, 0] [2, %c3_vscale] [1, 1] : tensor<2x100xf32> to tensor<2x?xf32>
+  // Another simple example, but value bounds will be used to resolve the tensor.dim.
+  %dim = tensor.dim %slice, %c1 : tensor<2x?xf32>
+  %mask = vector.create_mask %c3, %dim : vector<3x[4]xi1>
+  "test.some_use"(%mask) : (vector<3x[4]xi1>) -> ()
+  return
+}
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index c978699e179fc..f74ff2725f815 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -874,6 +874,38 @@ struct TestVectorLinearize final
       return signalPassFailure();
   }
 };
+
+struct TestEliminateVectorMasks
+    : public PassWrapper<TestEliminateVectorMasks,
+                         OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestEliminateVectorMasks)
+
+  TestEliminateVectorMasks() = default;
+  TestEliminateVectorMasks(const TestEliminateVectorMasks &pass)
+      : PassWrapper(pass) {}
+
+  Option<unsigned> vscaleMin{
+      *this, "vscale-min",
+      llvm::cl::desc(
+          "Minimum value `vector.vscale` can possibly be at runtime."),
+      llvm::cl::init(1)};
+
+  Option<unsigned> vscaleMax{
+      *this, "vscale-max",
+      llvm::cl::desc(
+          "Maximum value `vector.vscale` can possibly be at runtime."),
+      llvm::cl::init(16)};
+
+  StringRef getArgument() const final { return "test-eliminate-vector-masks"; }
+  StringRef getDescription() const final {
+    return "Test eliminating vector masks";
+  }
+  void runOnOperation() override {
+    IRRewriter rewriter(&getContext());
+    eliminateVectorMasks(rewriter, getOperation(),
+                         VscaleRange{vscaleMin, vscaleMax});
+  }
+};
 } // namespace
 
 namespace mlir {
@@ -920,6 +952,8 @@ void registerTestVectorLowerings() {
   PassRegistration<TestVectorEmulateMaskedLoadStore>();
 
   PassRegistration<TestVectorLinearize>();
+
+  PassRegistration<TestEliminateVectorMasks>();
 }
 } // namespace test
 } // namespace mlir

>From 7af1229b9f2cfab5d5437bf34bbbb272dd9108b6 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Fri, 19 Jul 2024 09:59:00 +0000
Subject: [PATCH 2/8] Fixups

---
 .../Dialect/Vector/Transforms/VectorTransforms.h     |  2 +-
 .../Vector/Transforms/VectorMaskElimination.cpp      | 12 ++++++++++++
 mlir/test/Dialect/Vector/eliminate-masks.mlir        | 12 ++++++------
 .../test/lib/Dialect/Vector/TestVectorTransforms.cpp |  9 ++-------
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h
index 847f333d6a931..e815e026305fa 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransforms.h
@@ -116,7 +116,7 @@ castAwayContractionLeadingOneDim(vector::ContractionOp contractOp,
                                  MaskingOpInterface maskingOp,
                                  RewriterBase &rewriter);
 
-/// Structure to hold the range [vscaleMin, vscaleMax] `vector.vscale` can take.
+// Structure to hold the range of `vector.vscale`.
 struct VscaleRange {
   unsigned vscaleMin;
   unsigned vscaleMax;
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
index abec8c75b8fc9..486784a9cf102 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
@@ -1,3 +1,11 @@
+//===- VectorMaskElimination.cpp - Eliminate Vector Masks -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h"
@@ -105,10 +113,14 @@ void eliminateVectorMasks(IRRewriter &rewriter, FunctionOpInterface function,
     return;
 
   OpBuilder::InsertionGuard g(rewriter);
+
+  // Build worklist so we can safely insert new ops in
+  // `resolveAllTrueCreateMaskOp()`.
   SmallVector<vector::CreateMaskOp> worklist;
   function.walk([&](vector::CreateMaskOp createMaskOp) {
     worklist.push_back(createMaskOp);
   });
+
   rewriter.setInsertionPointToStart(&function.front());
   for (auto mask : worklist)
     (void)resolveAllTrueCreateMaskOp(rewriter, mask, *vscaleRange);
diff --git a/mlir/test/Dialect/Vector/eliminate-masks.mlir b/mlir/test/Dialect/Vector/eliminate-masks.mlir
index 99c9a60a09fac..51564d0ca1d4f 100644
--- a/mlir/test/Dialect/Vector/eliminate-masks.mlir
+++ b/mlir/test/Dialect/Vector/eliminate-masks.mlir
@@ -16,7 +16,7 @@ func.func @eliminate_redundant_masks_through_insert_and_extracts(%tensor: tensor
   %extracted_slice_0 = tensor.extract_slice %tensor[0, 0] [1, %c4_vscale] [1, 1] : tensor<1x1000xf32> to tensor<1x?xf32>
   %output_tensor = scf.for %i = %c0 to %c1000 step %c4_vscale iter_args(%arg = %extracted_slice_0) -> tensor<1x?xf32> {
     // 1. Extract a slice.
-    %extracted_slice_1 = tensor.extract_slice %arg[0, 0] [1, %c4_vscale] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
+    %extracted_slice_1 = tensor.extract_slice %arg[0, %i] [1, %c4_vscale] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
 
     // 2. Create a mask for the slice.
     %dim_1 = tensor.dim %extracted_slice_1, %c0 : tensor<?xf32>
@@ -30,7 +30,7 @@ func.func @eliminate_redundant_masks_through_insert_and_extracts(%tensor: tensor
     %write = vector.transfer_write %new_vec, %extracted_slice_1[%c0], %mask {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32>
 
     // 5. Insert and yield the new tensor value.
-    %result = tensor.insert_slice %write into %arg[0, 0] [1, %c4_vscale] [1, 1] : tensor<?xf32> into tensor<1x?xf32>
+    %result = tensor.insert_slice %write into %arg[0, %i] [1, %c4_vscale] [1, 1] : tensor<?xf32> into tensor<1x?xf32>
     scf.yield %result : tensor<1x?xf32>
   }
   "test.some_use"(%output_tensor) : (tensor<1x?xf32>) -> ()
@@ -57,8 +57,8 @@ func.func @negative_extract_slice_size_shrink(%tensor: tensor<1000xf32>) {
     %mask = vector.create_mask %dim : vector<[4]xi1>
     "test.some_use"(%mask) : (vector<[4]xi1>) -> ()
     // !!! Here the size of the mask could shrink in the next iteration.
-    %next_num_els = affine.min  affine_map<(d0)[s0] -> (-d0 + 1000, s0)>(%i)[%c4_vscale]
-    %new_extracted_slice = tensor.extract_slice %tensor[%c4_vscale] [%next_num_els] [1] : tensor<1000xf32> to tensor<?xf32>
+    %next_num_elts = affine.min  affine_map<(d0)[s0] -> (-d0 + 1000, s0)>(%i)[%c4_vscale]
+    %new_extracted_slice = tensor.extract_slice %tensor[%c4_vscale] [%next_num_elts] [1] : tensor<1000xf32> to tensor<?xf32>
     scf.yield %new_extracted_slice : tensor<?xf32>
   }
   "test.some_use"(%slice) : (tensor<?xf32>) -> ()
@@ -110,8 +110,8 @@ func.func @negative_value_bounds_fixed_dim_not_all_true(%tensor: tensor<2x?xf32>
   %c4 = arith.constant 4 : index
   %vscale = vector.vscale
   %c4_vscale = arith.muli %vscale, %c4 : index
-  // This is _very_ simple but since addi is not a constant value bounds will
-  // be used to resolve it.
+  // This is _very_ simple but since tensor.dim is not a constant value bounds
+  // will be used to resolve it.
   %dim = tensor.dim %tensor, %c0 : tensor<2x?xf32>
   %mask = vector.create_mask %dim, %c4_vscale : vector<3x[4]xi1>
   "test.some_use"(%mask) : (vector<3x[4]xi1>) -> ()
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index f74ff2725f815..69d8ec79407b5 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -885,15 +885,10 @@ struct TestEliminateVectorMasks
       : PassWrapper(pass) {}
 
   Option<unsigned> vscaleMin{
-      *this, "vscale-min",
-      llvm::cl::desc(
-          "Minimum value `vector.vscale` can possibly be at runtime."),
+      *this, "vscale-min", llvm::cl::desc("Minimum possible value of vscale."),
       llvm::cl::init(1)};
-
   Option<unsigned> vscaleMax{
-      *this, "vscale-max",
-      llvm::cl::desc(
-          "Maximum value `vector.vscale` can possibly be at runtime."),
+      *this, "vscale-max", llvm::cl::desc("Maximum possible value of vscale."),
       llvm::cl::init(16)};
 
   StringRef getArgument() const final { return "test-eliminate-vector-masks"; }

>From f9d83d61349f4d0109c11b2c0844b1366de4f6cf Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 22 Jul 2024 12:56:46 +0000
Subject: [PATCH 3/8] Comments

---
 mlir/test/Dialect/Vector/eliminate-masks.mlir | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mlir/test/Dialect/Vector/eliminate-masks.mlir b/mlir/test/Dialect/Vector/eliminate-masks.mlir
index 51564d0ca1d4f..ff859081fe5c8 100644
--- a/mlir/test/Dialect/Vector/eliminate-masks.mlir
+++ b/mlir/test/Dialect/Vector/eliminate-masks.mlir
@@ -51,8 +51,8 @@ func.func @negative_extract_slice_size_shrink(%tensor: tensor<1000xf32>) {
   %c4_vscale = arith.muli %vscale, %c4 : index
   %extracted_slice = tensor.extract_slice %tensor[0] [%c4_vscale] [1] : tensor<1000xf32> to tensor<?xf32>
   %slice = scf.for %i = %c0 to %c1000 step %c4_vscale iter_args(%arg = %extracted_slice) -> tensor<?xf32> {
-    // This mask cannot be eliminated even though looking at the above operations
-    // it appears `tensor.dim` will always be c4_vscale (so the mask all-true).
+    // This mask cannot be eliminated even though looking at the operations above
+    // (this comment) it appears `tensor.dim` will always be c4_vscale (so the mask all-true).
     %dim = tensor.dim %arg, %c0 : tensor<?xf32>
     %mask = vector.create_mask %dim : vector<[4]xi1>
     "test.some_use"(%mask) : (vector<[4]xi1>) -> ()
@@ -77,6 +77,8 @@ func.func @negative_constant_dim_not_all_true()
   %c4 = arith.constant 4 : index
   %vscale = vector.vscale
   %c4_vscale = arith.muli %vscale, %c4 : index
+  // Since %c1 is a constant, this will be found not to be all-true via simple
+  // pattern matching.
   %mask = vector.create_mask %c1, %c4_vscale : vector<2x[4]xi1>
   "test.some_use"(%mask) : (vector<2x[4]xi1>) -> ()
   return
@@ -93,6 +95,8 @@ func.func @negative_constant_vscale_multiple_not_all_true() {
   %c3 = arith.constant 3 : index
   %vscale = vector.vscale
   %c3_vscale = arith.muli %vscale, %c3 : index
+  // Since %c3_vscale is a constant vscale multiple, this will be found not to
+  // be all-true via simple pattern matching.
   %mask = vector.create_mask %c2, %c3_vscale : vector<2x[4]xi1>
   "test.some_use"(%mask) : (vector<2x[4]xi1>) -> ()
   return

>From f67b4cb27181786d9753afac79acee6be9940f8f Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Mon, 22 Jul 2024 23:43:59 +0000
Subject: [PATCH 4/8] fix

---
 llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 11f9f7822a15c..3ba56a1a3af9d 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -947,7 +947,8 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
                            SCEV::FlagAnyWrap);
     return Result;
   } else if (EnableVScaleImmediates)
-    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
+    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S);
+        M && M->getNumOperands() == 2)
       if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
         if (isa<SCEVVScale>(M->getOperand(1))) {
           S = SE.getConstant(M->getType(), 0);

>From 2276a36a3c84f5b98f1f16ad956613b6b74e3e12 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 23 Jul 2024 07:17:30 +0000
Subject: [PATCH 5/8] Revert "fix"

This reverts commit f67b4cb27181786d9753afac79acee6be9940f8f.
---
 llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 3ba56a1a3af9d..11f9f7822a15c 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -947,8 +947,7 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
                            SCEV::FlagAnyWrap);
     return Result;
   } else if (EnableVScaleImmediates)
-    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S);
-        M && M->getNumOperands() == 2)
+    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
       if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
         if (isa<SCEVVScale>(M->getOperand(1))) {
           S = SE.getConstant(M->getType(), 0);

>From d2c73c311980f2f8498601e143cac82bd1489161 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 23 Jul 2024 07:30:02 +0000
Subject: [PATCH 6/8] Precommit vscale-fixups.ll test

---
 .../AArch64/vscale-fixups.ll                  | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index 483955c1c57a0..56b59012eef40 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -384,4 +384,51 @@ for.exit:
   ret void
 }
 
+;; This test demonstrates an incorrect MUL VL address calculation. Here there
+;; are two writes that should be `16 * vscale * vscale` apart, however,
+;; loop-strength-reduce has ignored the second `vscale` and offset the second
+;; write by `#4, mul vl` which is an offset of `16 * vscale` dropping a vscale.
+define void @vscale_squared_offset(ptr %alloc) #0 {
+; COMMON-LABEL: vscale_squared_offset:
+; COMMON:       // %bb.0: // %entry
+; COMMON-NEXT:    fmov z0.s, #4.00000000
+; COMMON-NEXT:    mov x8, xzr
+; COMMON-NEXT:    cntw x9
+; COMMON-NEXT:    fmov z1.s, #8.00000000
+; COMMON-NEXT:    ptrue p0.s, vl1
+; COMMON-NEXT:    cmp x8, x9
+; COMMON-NEXT:    b.ge .LBB6_2
+; COMMON-NEXT:  .LBB6_1: // %for.body
+; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT:    st1w { z0.s }, p0, [x0]
+; COMMON-NEXT:    add x8, x8, #1
+; COMMON-NEXT:    st1w { z1.s }, p0, [x0, #4, mul vl]
+; COMMON-NEXT:    addvl x0, x0, #1
+; COMMON-NEXT:    cmp x8, x9
+; COMMON-NEXT:    b.lt .LBB6_1
+; COMMON-NEXT:  .LBB6_2: // %for.exit
+; COMMON-NEXT:    ret
+entry:
+  %vscale = call i64 @llvm.vscale.i64()
+  %c4_vscale = mul i64 %vscale, 4
+  br label %for.check
+for.check:
+  %i = phi i64 [ %next_i, %for.body ], [ 0, %entry ]
+  %is_lt = icmp slt i64 %i, %c4_vscale
+  br i1 %is_lt, label %for.body, label %for.exit
+for.body:
+  %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i64(i64 0, i64 1)
+  %upper_offset = mul i64 %i, %c4_vscale
+  %upper_ptr = getelementptr float, ptr %alloc, i64 %upper_offset
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 4.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), ptr %upper_ptr, i32 4, <vscale x 4 x i1> %mask)
+  %lower_i = add i64 %i, %c4_vscale
+  %lower_offset = mul i64 %lower_i, %c4_vscale
+  %lower_ptr = getelementptr float, ptr %alloc, i64 %lower_offset
+  call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 8.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), ptr %lower_ptr, i32 4, <vscale x 4 x i1> %mask)
+  %next_i = add i64 %i, 1
+  br label %for.check
+for.exit:
+  ret void
+}
+
 attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }

>From 028a4fa2b936f4b798fc3d738fcee4119f2ba699 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 23 Jul 2024 07:30:45 +0000
Subject: [PATCH 7/8] Revert "Revert "fix""

This reverts commit 2276a36a3c84f5b98f1f16ad956613b6b74e3e12.
---
 .../Transforms/Scalar/LoopStrengthReduce.cpp  |  3 ++-
 .../AArch64/vscale-fixups.ll                  | 20 +++++++++++--------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 11f9f7822a15c..3ba56a1a3af9d 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -947,7 +947,8 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
                            SCEV::FlagAnyWrap);
     return Result;
   } else if (EnableVScaleImmediates)
-    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
+    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S);
+        M && M->getNumOperands() == 2)
       if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
         if (isa<SCEVVScale>(M->getOperand(1))) {
           S = SE.getConstant(M->getType(), 0);
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index 56b59012eef40..588696d20227f 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -384,27 +384,31 @@ for.exit:
   ret void
 }
 
-;; This test demonstrates an incorrect MUL VL address calculation. Here there
-;; are two writes that should be `16 * vscale * vscale` apart, however,
-;; loop-strength-reduce has ignored the second `vscale` and offset the second
-;; write by `#4, mul vl` which is an offset of `16 * vscale` dropping a vscale.
+;; Here are two writes that should be `16 * vscale * vscale` apart, so MUL VL
+;; addressing cannot be used to offset the second write, as for example,
+;; `#4, mul vl` would only be an offset of `16 * vscale` (dropping a vscale).
 define void @vscale_squared_offset(ptr %alloc) #0 {
 ; COMMON-LABEL: vscale_squared_offset:
 ; COMMON:       // %bb.0: // %entry
+; COMMON-NEXT:    rdvl x9, #1
 ; COMMON-NEXT:    fmov z0.s, #4.00000000
 ; COMMON-NEXT:    mov x8, xzr
-; COMMON-NEXT:    cntw x9
+; COMMON-NEXT:    lsr x9, x9, #4
 ; COMMON-NEXT:    fmov z1.s, #8.00000000
+; COMMON-NEXT:    cntw x10
 ; COMMON-NEXT:    ptrue p0.s, vl1
-; COMMON-NEXT:    cmp x8, x9
+; COMMON-NEXT:    umull x9, w9, w9
+; COMMON-NEXT:    lsl x9, x9, #6
+; COMMON-NEXT:    cmp x8, x10
 ; COMMON-NEXT:    b.ge .LBB6_2
 ; COMMON-NEXT:  .LBB6_1: // %for.body
 ; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT:    add x11, x0, x9
 ; COMMON-NEXT:    st1w { z0.s }, p0, [x0]
 ; COMMON-NEXT:    add x8, x8, #1
-; COMMON-NEXT:    st1w { z1.s }, p0, [x0, #4, mul vl]
+; COMMON-NEXT:    st1w { z1.s }, p0, [x11]
 ; COMMON-NEXT:    addvl x0, x0, #1
-; COMMON-NEXT:    cmp x8, x9
+; COMMON-NEXT:    cmp x8, x10
 ; COMMON-NEXT:    b.lt .LBB6_1
 ; COMMON-NEXT:  .LBB6_2: // %for.exit
 ; COMMON-NEXT:    ret

>From eff3f6451cd54055a4d2b6cff29b73ca8e427f8c Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell at arm.com>
Date: Tue, 23 Jul 2024 13:51:54 +0000
Subject: [PATCH 8/8] Review fixups

---
 .../Transforms/VectorMaskElimination.cpp      | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
index 486784a9cf102..9ad0de5cadeae 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorMaskElimination.cpp
@@ -68,26 +68,28 @@ LogicalResult resolveAllTrueCreateMaskOp(IRRewriter &rewriter,
   for (auto [i, dimSize] : unknownDims) {
     // Compute the lower bound for the unknown dimension (i.e. the smallest
     // value it could be).
-    auto lowerBound =
+    FailureOr<ConstantOrScalableBound> dimLowerBound =
         vector::ScalableValueBoundsConstraintSet::computeScalableBound(
             dimSize, {}, vscaleRange.vscaleMin, vscaleRange.vscaleMax,
             presburger::BoundType::LB);
-    if (failed(lowerBound))
+    if (failed(dimLowerBound))
       return failure();
-    auto boundSize = lowerBound->getSize();
-    if (failed(boundSize))
+    auto dimLowerBoundSize = dimLowerBound->getSize();
+    if (failed(dimLowerBoundSize))
       return failure();
-    if (boundSize->scalable) {
-      // If the lower bound is scalable and >= to the mask dim size then this
-      // dim is all-true.
-      if (boundSize->baseSize < maskTypeDimSizes[i])
+    if (dimLowerBoundSize->scalable) {
+      // If the lower bound is scalable and < the mask dim size then this dim is
+      // not all-true.
+      if (dimLowerBoundSize->baseSize < maskTypeDimSizes[i])
         return failure();
     } else {
-      // If the lower bound is a constant and >= to the _fixed-size_ mask dim
-      // size then this dim is all-true.
+      // If the lower bound is a constant:
+      // - If the mask dim size is scalable then this dim is not all-true.
       if (maskTypeDimScalableFlags[i])
         return failure();
-      if (boundSize->baseSize < maskTypeDimSizes[i])
+      // - If the lower bound is < the _fixed-size_ mask dim size then this dim
+      // is not all-true.
+      if (dimLowerBoundSize->baseSize < maskTypeDimSizes[i])
         return failure();
     }
   }