[Mlir-commits] [mlir] 778a249 - [mlir][Affine] affine-super-vectorize transform op (#177755)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Sun Jan 25 01:26:26 PST 2026
Author: Gabriel Dehame
Date: 2026-01-25T10:26:21+01:00
New Revision: 778a2491149512109541cd5d59bad2d55024bdb7
URL: https://github.com/llvm/llvm-project/commit/778a2491149512109541cd5d59bad2d55024bdb7
DIFF: https://github.com/llvm/llvm-project/commit/778a2491149512109541cd5d59bad2d55024bdb7.diff
LOG: [mlir][Affine] affine-super-vectorize transform op (#177755)
Added an operation in the transform dialect to apply the
affine-super-vectorize pass locally.
The operation vectorizes the loops that are children of the provided
operation, itself included.
Also added a test file verifying the operation behaves.
Added:
mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir
Modified:
mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
mlir/include/mlir/Dialect/Affine/Utils.h
mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
index 2969b4238dd67..8edc531e11bb5 100644
--- a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
+++ b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
@@ -94,4 +94,38 @@ def SimplifyMinMaxAffineOpsOp :
}];
}
+def SuperVectorizeOp
+ : Op<Transform_Dialect, "affine.super_vectorize",
+ [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+ DeclareOpInterfaceMethods<TransformOpInterface>]> {
+ let description = [{
+ Vectorize to a target independent n-D vector abstraction.
+ This operation is an exposition to the transform dialect of the affine-super-vectorize pass.
+ To make its usage easier, it ignores inputs which are children of an affine.for op (itself excluded),
+ this way the matcher can be simpler.
+
+ This operation may generate operations from the vector dialect.
+
+ Example:
+ ```
+ %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.affine.super_vectorize %0 [8, 16] fastest_varying_pattern=[1,0] vectorize_reductions=true : !transform.any_op
+ ```
+ }];
+
+ let arguments = (ins TransformHandleTypeInterface:$target,
+ DenseI64ArrayAttr:$vector_sizes,
+ OptionalAttr<DenseI64ArrayAttr>:$fastest_varying_pattern,
+ DefaultValuedAttr<BoolAttr, "false">:$vectorize_reductions);
+ let results = (outs);
+
+ let assemblyFormat = [{
+ $target $vector_sizes
+ (`fastest_varying_pattern` `=` $fastest_varying_pattern^)?
+ (`vectorize_reductions` `=` $vectorize_reductions^)?
+ attr-dict `:` type($target)
+ }];
+ let hasVerifier = 1;
+}
+
#endif // Affine_TRANSFORM_OPS
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
index ac11f5a7c24c7..d44d1389430ba 100644
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -103,6 +103,11 @@ struct VectorizationStrategy {
ReductionLoopMap reductionLoops;
};
+/// Vectorize affine loops that are children of parentOp (including itself)
+void vectorizeChildAffineLoops(Operation *parentOp, bool vectorizeReductions,
+ ArrayRef<int64_t> vectorSizes,
+ ArrayRef<int64_t> fastestVaryingPattern);
+
/// Replace affine store and load accesses by scalars by forwarding stores to
/// loads and eliminate invariant affine loads; consequently, eliminate dead
/// allocs.
diff --git a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
index b1e40d9b289ec..d5930d9e3d77f 100644
--- a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
+++ b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
@@ -13,9 +13,13 @@
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Affine/Transforms/Transforms.h"
+#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/ArrayRef.h"
+#include <cstdint>
using namespace mlir;
using namespace mlir::affine;
@@ -149,6 +153,41 @@ void SimplifyBoundedAffineOpsOp::getEffects(
modifiesPayload(effects);
}
+//===----------------------------------------------------------------------===//
+// SimplifyMinMaxAffineOpsOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult SuperVectorizeOp::verify() {
+ if (getFastestVaryingPattern().has_value()) {
+ if (getFastestVaryingPattern()->size() != getVectorSizes().size())
+ return emitOpError()
+ << "fastest varying pattern specified with different size than "
+ "the vector size";
+ }
+ return success();
+}
+
+DiagnosedSilenceableFailure
+SuperVectorizeOp::apply(transform::TransformRewriter &rewriter,
+ TransformResults &results, TransformState &state) {
+ ArrayRef<int64_t> fastestVaryingPattern;
+ if (getFastestVaryingPattern().has_value())
+ fastestVaryingPattern = getFastestVaryingPattern().value();
+
+ for (Operation *target : state.getPayloadOps(getTarget()))
+ if (!target->getParentOfType<affine::AffineForOp>())
+ vectorizeChildAffineLoops(target, getVectorizeReductions(),
+ getVectorSizes(), fastestVaryingPattern);
+
+ return DiagnosedSilenceableFailure::success();
+}
+
+void SuperVectorizeOp::getEffects(
+ SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+ consumesHandle(getTargetMutable(), effects);
+ modifiesPayload(effects);
+}
+
//===----------------------------------------------------------------------===//
// SimplifyMinMaxAffineOpsOp
//===----------------------------------------------------------------------===//
@@ -200,6 +239,7 @@ class AffineTransformDialectExtension
void init() {
declareGeneratedDialect<AffineDialect>();
+ declareGeneratedDialect<vector::VectorDialect>();
registerTransformOps<
#define GET_OP_LIST
diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
index a09f8593fab10..e350ab4348271 100644
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -1774,34 +1774,16 @@ static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
LLVM_DEBUG(dbgs() << "\n");
}
-/// Applies vectorization to the current function by searching over a bunch of
-/// predetermined patterns.
-void Vectorize::runOnOperation() {
- func::FuncOp f = getOperation();
- if (!fastestVaryingPattern.empty() &&
- fastestVaryingPattern.size() != vectorSizes.size()) {
- f.emitRemark("Fastest varying pattern specified with different size than "
- "the vector size.");
- return signalPassFailure();
- }
-
- if (vectorizeReductions && vectorSizes.size() != 1) {
- f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
- return signalPassFailure();
- }
-
- if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) {
- f.emitError("Vectorization factor must be greater than zero.");
- return signalPassFailure();
- }
-
+void affine::vectorizeChildAffineLoops(
+ Operation *parentOp, bool vectorizeReductions,
+ ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern) {
DenseSet<Operation *> parallelLoops;
ReductionLoopMap reductionLoops;
// If 'vectorize-reduction=true' is provided, we also populate the
// `reductionLoops` map.
if (vectorizeReductions) {
- f.walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
+ parentOp->walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
SmallVector<LoopReduction, 2> reductions;
if (isLoopParallel(loop, &reductions)) {
parallelLoops.insert(loop);
@@ -1811,7 +1793,7 @@ void Vectorize::runOnOperation() {
}
});
} else {
- f.walk([&parallelLoops](AffineForOp loop) {
+ parentOp->walk([&parallelLoops](AffineForOp loop) {
if (isLoopParallel(loop))
parallelLoops.insert(loop);
});
@@ -1819,10 +1801,35 @@ void Vectorize::runOnOperation() {
// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;
- vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,
+ vectorizeLoops(parentOp, parallelLoops, vectorSizes, fastestVaryingPattern,
reductionLoops);
}
+/// Applies vectorization to the current function by searching over a bunch of
+/// predetermined patterns.
+void Vectorize::runOnOperation() {
+ func::FuncOp f = getOperation();
+ if (!fastestVaryingPattern.empty() &&
+ fastestVaryingPattern.size() != vectorSizes.size()) {
+ f.emitRemark("Fastest varying pattern specified with different size than "
+ "the vector size.");
+ return signalPassFailure();
+ }
+
+ if (vectorizeReductions && vectorSizes.size() != 1) {
+ f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
+ return signalPassFailure();
+ }
+
+ if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) {
+ f.emitError("Vectorization factor must be greater than zero.");
+ return signalPassFailure();
+ }
+
+ vectorizeChildAffineLoops(f, vectorizeReductions, vectorSizes,
+ fastestVaryingPattern);
+}
+
/// Verify that affine loops in 'loops' meet the nesting criteria expected by
/// SuperVectorizer:
/// * There must be at least one loop.
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir b/mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir
new file mode 100644
index 0000000000000..5734362ee855f
--- /dev/null
+++ b/mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir
@@ -0,0 +1,129 @@
+//RUN: mlir-opt %s --transform-interpreter -split-input-file | FileCheck %s
+
+// CHECK-LABEL: func @vec1d_3
+func.func @vec1d_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %M = memref.dim %A, %c0 : memref<?x?xf32>
+ %N = memref.dim %A, %c1 : memref<?x?xf32>
+ %P = memref.dim %B, %c2 : memref<?x?x?xf32>
+
+// CHECK: %{{.*}} = vector.transfer_read {{.*}} : memref<?x?xf32>, vector<128xf32>
+ affine.for %i8 = 0 to %M { // vectorized
+ affine.for %i9 = 0 to %N {
+ %a9 = affine.load %A[%i9, %i8 + %i9] : memref<?x?xf32>
+ }
+ }
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.affine.super_vectorize %0 [128] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: func @vectorize_matmul
+func.func @vectorize_matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %M = memref.dim %arg0, %c0 : memref<?x?xf32>
+ %K = memref.dim %arg0, %c1 : memref<?x?xf32>
+ %N = memref.dim %arg2, %c1 : memref<?x?xf32>
+ // CHECK: vector.transfer_write {{.*}} : vector<4x8xf32>, memref<?x?xf32>
+ affine.for %i0 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%M) {
+ affine.for %i1 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%N) {
+ %cst = arith.constant 0.000000e+00 : f32
+ affine.store %cst, %arg2[%i0, %i1] : memref<?x?xf32>
+ }
+ }
+ // CHECK: %{{.*}} = vector.transfer_read {{.*}} : memref<?x?xf32>, vector<4x8xf32>
+ // CHECK: %{{.*}} = vector.transfer_read {{.*}} : memref<?x?xf32>, vector<4x8xf32>
+ // CHECK: %{{.*}} = vector.transfer_read {{.*}} : memref<?x?xf32>, vector<4x8xf32>
+ // CHECK: vector.transfer_write %{{.*}} : vector<4x8xf32>, memref<?x?xf32>
+ affine.for %i2 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%M) {
+ affine.for %i3 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%N) {
+ affine.for %i4 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%K) {
+ %6 = affine.load %arg1[%i4, %i3] : memref<?x?xf32>
+ %7 = affine.load %arg0[%i2, %i4] : memref<?x?xf32>
+ %8 = arith.mulf %7, %6 : f32
+ %9 = affine.load %arg2[%i2, %i3] : memref<?x?xf32>
+ %10 = arith.addf %9, %8 : f32
+ affine.store %10, %arg2[%i2, %i3] : memref<?x?xf32>
+ }
+ }
+ }
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.affine.super_vectorize %0 [4, 8] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @vec3d(%A : memref<?x?x?xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %0 = memref.dim %A, %c0 : memref<?x?x?xf32>
+ %1 = memref.dim %A, %c1 : memref<?x?x?xf32>
+ %2 = memref.dim %A, %c2 : memref<?x?x?xf32>
+ // CHECK: %{{.*}} = vector.transfer_read {{.*}} : memref<?x?x?xf32>, vector<32x64x256xf32>
+ affine.for %t0 = 0 to %0 {
+ affine.for %t1 = 0 to %0 {
+ affine.for %i0 = 0 to %0 {
+ affine.for %i1 = 0 to %1 {
+ affine.for %i2 = 0 to %2 {
+ %a2 = affine.load %A[%i0, %i1, %i2] : memref<?x?x?xf32>
+ }
+ }
+ }
+ }
+ }
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.affine.super_vectorize %0 [32, 64, 256] fastest_varying_pattern=[2,1,0] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @vecdim_reduction_minf(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = arith.constant 0x7F800000 : f32
+ affine.for %i = 0 to 256 {
+ %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %min = arith.minimumf %red_iter, %ld : f32
+ affine.yield %min : f32
+ }
+ affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+// CHECK-LABEL: @vecdim_reduction_minf
+// CHECK: %{{.*}} = vector.transfer_read {{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %{{.*}} = vector.reduction <minimumf>, {{.*}} : vector<128xf32> into f32
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.affine.super_vectorize %0 [128] vectorize_reductions=true : !transform.any_op
+ transform.yield
+ }
+}
More information about the Mlir-commits
mailing list