[Mlir-commits] [mlir] 778a249 - [mlir][Affine] affine-super-vectorize transform op (#177755)

Sun Jan 25 01:26:26 PST 2026

Author: Gabriel Dehame
Date: 2026-01-25T10:26:21+01:00
New Revision: 778a2491149512109541cd5d59bad2d55024bdb7

URL: https://github.com/llvm/llvm-project/commit/778a2491149512109541cd5d59bad2d55024bdb7
DIFF: https://github.com/llvm/llvm-project/commit/778a2491149512109541cd5d59bad2d55024bdb7.diff

LOG: [mlir][Affine] affine-super-vectorize transform op (#177755)

Added an operation in the transform dialect to apply the
affine-super-vectorize pass locally.
The operation vectorizes the loops that are children of the provided
operation, itself included.
Also added a test file verifying that the operation behaves as expected.

Added: 
    mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir

Modified: 
    mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
    mlir/include/mlir/Dialect/Affine/Utils.h
    mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
    mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
index 2969b4238dd67..8edc531e11bb5 100644

--- a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
+++ b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
@@ -94,4 +94,38 @@ def SimplifyMinMaxAffineOpsOp :
   }];
 }
 
+def SuperVectorizeOp
+    : Op<Transform_Dialect, "affine.super_vectorize",
+         [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+          DeclareOpInterfaceMethods<TransformOpInterface>]> {
+  let description = [{
+    Vectorizes affine loops to a target-independent n-D vector abstraction.
+    This operation exposes the `affine-super-vectorize` pass to the transform
+    dialect so that it can be applied locally: the affine loops nested under
+    each payload operation of `target` (the operation itself included) are
+    considered for vectorization.
+
+    To keep matchers simple, payload operations that are nested inside an
+    `affine.for` op are ignored. An `affine.for` target that is not itself
+    nested inside another `affine.for` is processed.
+
+    Attributes:
+      * `vector_sizes`: the vector sizes to vectorize with.
+      * `fastest_varying_pattern` (optional): forwarded to the vectorizer as
+        its fastest-varying pattern; when present it must have as many
+        elements as `vector_sizes`.
+      * `vectorize_reductions` (default `false`): also vectorize reduction
+        loops. As in the pass, this is supported only for 1-D vectors.
+
+    The `target` handle is consumed.
+    This operation may generate operations from the vector dialect.
+
+    Examples:
+    ```
+    // 2-D vectorization with an explicit fastest-varying pattern.
+    %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.affine.super_vectorize %0 [8, 16] fastest_varying_pattern=[1,0] : !transform.any_op
+    ```
+    ```
+    // 1-D vectorization that also vectorizes reductions.
+    %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.affine.super_vectorize %0 [128] vectorize_reductions=true : !transform.any_op
+    ```
+  }];
+
+  let arguments = (ins TransformHandleTypeInterface:$target,
+                       DenseI64ArrayAttr:$vector_sizes,
+                       OptionalAttr<DenseI64ArrayAttr>:$fastest_varying_pattern,
+                       DefaultValuedAttr<BoolAttr, "false">:$vectorize_reductions);
+  let results = (outs);
+
+  let assemblyFormat = [{
+      $target $vector_sizes
+          (`fastest_varying_pattern` `=` $fastest_varying_pattern^)?
+          (`vectorize_reductions` `=` $vectorize_reductions^)?
+          attr-dict `:` type($target)
+  }];
+  let hasVerifier = 1;
+}
+
 #endif // Affine_TRANSFORM_OPS

diff  --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
index ac11f5a7c24c7..d44d1389430ba 100644
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -103,6 +103,11 @@ struct VectorizationStrategy {
   ReductionLoopMap reductionLoops;
 };
 
+/// Vectorizes the affine loops nested under `parentOp`, `parentOp` itself
+/// included when it is an affine loop.
+///
+/// `parentOp` is walked for `affine.for` ops and the loops found to be parallel
+/// are handed to the super-vectorizer together with `vectorSizes` and
+/// `fastestVaryingPattern`. When `vectorizeReductions` is true, the walk also
+/// gathers loop reductions so that they can be vectorized.
+///
+/// The `affine-super-vectorize` pass validates its options before calling this
+/// function: a non-empty `fastestVaryingPattern` has the same size as
+/// `vectorSizes`, reductions are requested only with a single vector size, and
+/// every vector size is greater than zero. Other callers are expected to
+/// provide the same guarantees.
+void vectorizeChildAffineLoops(Operation *parentOp, bool vectorizeReductions,
+                               ArrayRef<int64_t> vectorSizes,
+                               ArrayRef<int64_t> fastestVaryingPattern);
+
 /// Replace affine store and load accesses by scalars by forwarding stores to
 /// loads and eliminate invariant affine loads; consequently, eliminate dead
 /// allocs.

diff  --git a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
index b1e40d9b289ec..d5930d9e3d77f 100644
--- a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
+++ b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
@@ -13,9 +13,13 @@
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
 #include "mlir/Dialect/Affine/LoopUtils.h"
 #include "mlir/Dialect/Affine/Transforms/Transforms.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/ArrayRef.h"
+#include <cstdint>
 
 using namespace mlir;
 using namespace mlir::affine;
@@ -149,6 +153,41 @@ void SimplifyBoundedAffineOpsOp::getEffects(
   modifiesPayload(effects);
 }
 
+//===----------------------------------------------------------------------===//
+// SuperVectorizeOp
+//===----------------------------------------------------------------------===//
+
+/// Verifies that the attributes form a combination the vectorizer accepts.
+/// The checks mirror the option validation done by the affine-super-vectorize
+/// pass before it invokes the same vectorization entry point:
+///   * `fastest_varying_pattern`, when present, has as many elements as
+///     `vector_sizes`;
+///   * reductions are vectorized only with a single (1-D) vector size;
+///   * every vector size is strictly positive.
+LogicalResult SuperVectorizeOp::verify() {
+  ArrayRef<int64_t> vectorSizes = getVectorSizes();
+
+  if (getFastestVaryingPattern().has_value() &&
+      getFastestVaryingPattern()->size() != vectorSizes.size())
+    return emitOpError() << "fastest varying pattern specified with different "
+                            "size than the vector size";
+
+  if (getVectorizeReductions() && vectorSizes.size() != 1)
+    return emitOpError()
+           << "vectorizing reductions is supported only for 1-D vectors";
+
+  for (int64_t size : vectorSizes)
+    if (size <= 0)
+      return emitOpError() << "vectorization factor must be greater than zero";
+
+  return success();
+}
+
+/// Runs the affine super-vectorizer on every payload operation of `target`
+/// that is not nested inside an `affine.for`; nested payload operations are
+/// skipped because they are already covered when their enclosing loop nest is
+/// processed. Always reports success.
+DiagnosedSilenceableFailure
+SuperVectorizeOp::apply(transform::TransformRewriter &rewriter,
+                        TransformResults &results, TransformState &state) {
+  // An absent pattern is forwarded to the vectorizer as an empty array.
+  ArrayRef<int64_t> fastestVaryingPattern;
+  if (getFastestVaryingPattern().has_value())
+    fastestVaryingPattern = getFastestVaryingPattern().value();
+
+  // Select the roots before rewriting anything, so that no payload operation
+  // is inspected after an earlier vectorization has modified the IR.
+  // NOTE(review): presumably the vectorizer erases the loop nests it replaces,
+  // which would leave nested payload entries dangling -- confirm.
+  SmallVector<Operation *> roots;
+  for (Operation *target : state.getPayloadOps(getTarget()))
+    if (!target->getParentOfType<affine::AffineForOp>())
+      roots.push_back(target);
+
+  for (Operation *root : roots)
+    vectorizeChildAffineLoops(root, getVectorizeReductions(), getVectorSizes(),
+                              fastestVaryingPattern);
+
+  return DiagnosedSilenceableFailure::success();
+}
+
+/// Reports the effects of this transform op: the `target` handle is consumed
+/// and the payload IR is modified.
+void SuperVectorizeOp::getEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  // The order of the two registrations is kept as-is: handle first, payload
+  // second.
+  transform::consumesHandle(getTargetMutable(), effects);
+  transform::modifiesPayload(effects);
+}
+
 //===----------------------------------------------------------------------===//
 // SimplifyMinMaxAffineOpsOp
 //===----------------------------------------------------------------------===//
@@ -200,6 +239,7 @@ class AffineTransformDialectExtension
 
   void init() {
     declareGeneratedDialect<AffineDialect>();
+    declareGeneratedDialect<vector::VectorDialect>();
 
     registerTransformOps<
 #define GET_OP_LIST

diff  --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
index a09f8593fab10..e350ab4348271 100644
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -1774,34 +1774,16 @@ static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
   LLVM_DEBUG(dbgs() << "\n");
 }
 
-/// Applies vectorization to the current function by searching over a bunch of
-/// predetermined patterns.
-void Vectorize::runOnOperation() {
-  func::FuncOp f = getOperation();
-  if (!fastestVaryingPattern.empty() &&
-      fastestVaryingPattern.size() != vectorSizes.size()) {
-    f.emitRemark("Fastest varying pattern specified with different size than "
-                 "the vector size.");
-    return signalPassFailure();
-  }
-
-  if (vectorizeReductions && vectorSizes.size() != 1) {
-    f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
-    return signalPassFailure();
-  }
-
-  if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) {
-    f.emitError("Vectorization factor must be greater than zero.");
-    return signalPassFailure();
-  }
-
+void affine::vectorizeChildAffineLoops(
+    Operation *parentOp, bool vectorizeReductions,
+    ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern) {
   DenseSet<Operation *> parallelLoops;
   ReductionLoopMap reductionLoops;
 
   // If 'vectorize-reduction=true' is provided, we also populate the
   // `reductionLoops` map.
   if (vectorizeReductions) {
-    f.walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
+    parentOp->walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
       SmallVector<LoopReduction, 2> reductions;
       if (isLoopParallel(loop, &reductions)) {
         parallelLoops.insert(loop);
@@ -1811,7 +1793,7 @@ void Vectorize::runOnOperation() {
       }
     });
   } else {
-    f.walk([&parallelLoops](AffineForOp loop) {
+    parentOp->walk([&parallelLoops](AffineForOp loop) {
       if (isLoopParallel(loop))
         parallelLoops.insert(loop);
     });
@@ -1819,10 +1801,35 @@ void Vectorize::runOnOperation() {
 
   // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
   NestedPatternContext mlContext;
-  vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,
+  vectorizeLoops(parentOp, parallelLoops, vectorSizes, fastestVaryingPattern,
                  reductionLoops);
 }
 
+/// Applies vectorization to the current function by searching over a bunch of
+/// predetermined patterns. The pass options are validated first; the
+/// vectorization itself is delegated to vectorizeChildAffineLoops.
+void Vectorize::runOnOperation() {
+  func::FuncOp f = getOperation();
+  // A fastest-varying pattern, when given, needs one entry per vector size.
+  if (!fastestVaryingPattern.empty() &&
+      fastestVaryingPattern.size() != vectorSizes.size()) {
+    f.emitRemark("Fastest varying pattern specified with different size than "
+                 "the vector size.");
+    return signalPassFailure();
+  }
+
+  // Reduction vectorization is limited to a single vector dimension.
+  if (vectorizeReductions && vectorSizes.size() != 1) {
+    f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
+    return signalPassFailure();
+  }
+
+  // Reject zero or negative vector sizes.
+  if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) {
+    f.emitError("Vectorization factor must be greater than zero.");
+    return signalPassFailure();
+  }
+
+  vectorizeChildAffineLoops(f, vectorizeReductions, vectorSizes,
+                            fastestVaryingPattern);
+}
+
 /// Verify that affine loops in 'loops' meet the nesting criteria expected by
 /// SuperVectorizer:
 ///   * There must be at least one loop.

diff  --git a/mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir b/mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir
new file mode 100644
index 0000000000000..5734362ee855f
--- /dev/null
+++ b/mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir
@@ -0,0 +1,129 @@
+//RUN: mlir-opt %s --transform-interpreter -split-input-file | FileCheck %s
+
+// 1-D case: the transform sequence of this split requests a vector size of
+// 128, and the scalar affine.load below is expected to be turned into a
+// vector.transfer_read producing vector<128xf32>.
+// CHECK-LABEL: func @vec1d_3
+func.func @vec1d_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
+   %c0 = arith.constant 0 : index
+   %c1 = arith.constant 1 : index
+   %c2 = arith.constant 2 : index
+   %M = memref.dim %A, %c0 : memref<?x?xf32>
+   %N = memref.dim %A, %c1 : memref<?x?xf32>
+   %P = memref.dim %B, %c2 : memref<?x?x?xf32>
+
+// CHECK:   %{{.*}} = vector.transfer_read {{.*}} : memref<?x?xf32>, vector<128xf32>
+   affine.for %i8 = 0 to %M { // vectorized
+     affine.for %i9 = 0 to %N {
+       %a9 = affine.load %A[%i9, %i8 + %i9] : memref<?x?xf32>
+     }
+   }
+   return
+}
+
+// Transform script: matches every affine.for in the payload and applies the
+// super-vectorizer with a single vector size of 128.
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.affine.super_vectorize %0 [128] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// 2-D case: the transform sequence of this split requests vector sizes
+// [4, 8]. The initialization nest is expected to become a
+// vector.transfer_write of vector<4x8xf32>; the accumulation nest is expected
+// to become three vector.transfer_read ops followed by a
+// vector.transfer_write, all on vector<4x8xf32>.
+// CHECK-LABEL: func @vectorize_matmul
+func.func @vectorize_matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %M = memref.dim %arg0, %c0 : memref<?x?xf32>
+  %K = memref.dim %arg0, %c1 : memref<?x?xf32>
+  %N = memref.dim %arg2, %c1 : memref<?x?xf32>
+  // CHECK:     vector.transfer_write {{.*}} : vector<4x8xf32>, memref<?x?xf32>
+  affine.for %i0 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%M) {
+    affine.for %i1 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%N) {
+      %cst = arith.constant 0.000000e+00 : f32
+      affine.store %cst, %arg2[%i0, %i1] : memref<?x?xf32>
+    }
+  }
+  //      CHECK:        %{{.*}} = vector.transfer_read {{.*}} : memref<?x?xf32>, vector<4x8xf32>
+  //      CHECK:        %{{.*}} = vector.transfer_read {{.*}} : memref<?x?xf32>, vector<4x8xf32>
+  //      CHECK:        %{{.*}} = vector.transfer_read {{.*}} : memref<?x?xf32>, vector<4x8xf32>
+  //      CHECK:        vector.transfer_write %{{.*}} : vector<4x8xf32>, memref<?x?xf32>
+  affine.for %i2 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%M) {
+    affine.for %i3 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%N) {
+      affine.for %i4 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%K) {
+        %6 = affine.load %arg1[%i4, %i3] : memref<?x?xf32>
+        %7 = affine.load %arg0[%i2, %i4] : memref<?x?xf32>
+        %8 = arith.mulf %7, %6 : f32
+        %9 = affine.load %arg2[%i2, %i3] : memref<?x?xf32>
+        %10 = arith.addf %9, %8 : f32
+        affine.store %10, %arg2[%i2, %i3] : memref<?x?xf32>
+      }
+    }
+  }
+  return
+}
+
+// Transform script: matches every affine.for in the payload and applies the
+// super-vectorizer with the 2-D vector sizes [4, 8].
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.affine.super_vectorize %0 [4, 8] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// 3-D case: the transform sequence of this split requests vector sizes
+// [32, 64, 256] with fastest_varying_pattern=[2,1,0]; the scalar affine.load
+// is expected to become a vector.transfer_read of vector<32x64x256xf32>.
+// The label below anchors the expectation to this function, as the other
+// test cases of this file do.
+// CHECK-LABEL: func @vec3d
+func.func @vec3d(%A : memref<?x?x?xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %0 = memref.dim %A, %c0 : memref<?x?x?xf32>
+  %1 = memref.dim %A, %c1 : memref<?x?x?xf32>
+  %2 = memref.dim %A, %c2 : memref<?x?x?xf32>
+  // CHECK:           %{{.*}} = vector.transfer_read {{.*}} : memref<?x?x?xf32>, vector<32x64x256xf32>
+  affine.for %t0 = 0 to %0 {
+    affine.for %t1 = 0 to %0 {
+      affine.for %i0 = 0 to %0 {
+        affine.for %i1 = 0 to %1 {
+          affine.for %i2 = 0 to %2 {
+            %a2 = affine.load %A[%i0, %i1, %i2] : memref<?x?x?xf32>
+          }
+        }
+      }
+    }
+  }
+  return
+}
+
+// Transform script: matches every affine.for in the payload and applies the
+// super-vectorizer with the 3-D vector sizes [32, 64, 256] and an explicit
+// fastest-varying pattern of the same length.
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.affine.super_vectorize %0 [32, 64, 256] fastest_varying_pattern=[2,1,0] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// Reduction case: the inner loop carries a minimumf reduction through
+// iter_args. The transform sequence of this split requests a vector size of
+// 128 with vectorize_reductions=true; the expectations after the function
+// look for a vector.transfer_read of vector<128xf32> and a vector.reduction
+// <minimumf> folding the vector back into an f32.
+func.func @vecdim_reduction_minf(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = arith.constant 0x7F800000 : f32
+ affine.for %i = 0 to 256 {
+   %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+     %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+     %min = arith.minimumf %red_iter, %ld : f32
+     affine.yield %min : f32
+   }
+   affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction_minf
+// CHECK:           %{{.*}} = vector.transfer_read {{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK:         %{{.*}} = vector.reduction <minimumf>, {{.*}} : vector<128xf32> into f32
+
+// Transform script: matches every affine.for in the payload and applies the
+// super-vectorizer with a single vector size of 128, with reduction
+// vectorization enabled.
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.affine.super_vectorize %0 [128] vectorize_reductions=true : !transform.any_op
+    transform.yield
+  }
+}