[Mlir-commits] [mlir] [mlir][Affine] affine-super-vectorize transform op (PR #126522)
llvmlistbot at llvm.org
Mon Feb 10 06:28:46 PST 2025
https://github.com/gdehame created https://github.com/llvm/llvm-project/pull/126522
Added an operation to the transform dialect that applies the affine-super-vectorize pass locally.
The operation vectorizes the affine loops nested under the provided operation, including the operation itself.
Also added a test file verifying that the operation behaves as expected.
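For reference, here is a minimal end-to-end use of the new op, mirroring the first case in the added test file: the op is invoked from a transform named sequence on handles produced by `transform.structured.match`.

```mlir
// Match every affine.for in the payload and 1-D super-vectorize it with a
// vector size of 128. Handles to loops nested inside another affine.for are
// ignored by the op, so matching all affine.for ops at once is safe.
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(
      %arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["affine.for"]} in %arg1
        : (!transform.any_op) -> !transform.any_op
    transform.affine.super_vectorize %0 [128] : !transform.any_op
    transform.yield
  }
}
```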
From 086e4b2fe0014ebfa8fe753556ae482a657943e7 Mon Sep 17 00:00:00 2001
From: gdehame <gabrieldehame at gmail.com>
Date: Mon, 10 Feb 2025 15:18:07 +0100
Subject: [PATCH] [mlir][Affine] affine-super-vectorize transform op
Added an operation to the transform dialect that applies the affine-super-vectorize pass locally.
The operation vectorizes the affine loops nested under the provided operation, including the operation itself.
Also added a test file verifying that the operation behaves as expected.
---
.../Affine/TransformOps/AffineTransformOps.td | 31 ++++
mlir/include/mlir/Dialect/Affine/Utils.h | 6 +
.../TransformOps/AffineTransformOps.cpp | 32 ++++
.../Affine/Transforms/SuperVectorize.cpp | 56 +++---
.../Affine/SuperVectorize/transform_op.mlir | 169 ++++++++++++++++++
5 files changed, 270 insertions(+), 24 deletions(-)
create mode 100644 mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir
diff --git a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
index 70b127fd063caf..b0c0ee63839dfa 100644
--- a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
+++ b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
@@ -63,4 +63,35 @@ def SimplifyBoundedAffineOpsOp
}];
}
+def SuperVectorizeOp
+ : Op<Transform_Dialect, "affine.super_vectorize",
+ [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+ DeclareOpInterfaceMethods<TransformOpInterface>]> {
+  let description = [{
+    Vectorize to a target-independent n-D vector abstraction.
+    This operation exposes the affine-super-vectorize pass to the transform
+    dialect. To make it easier to use, payload operations that are nested
+    inside another affine.for op are ignored, so the matcher can stay simple.
+
+ Example:
+ ```
+ %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.affine.super_vectorize %0 [8, 16] fastest_varying_pattern=[1,0] vectorize_reductions=true : !transform.any_op
+ ```
+ }];
+
+ let arguments = (ins TransformHandleTypeInterface:$target,
+ DenseI64ArrayAttr:$vector_sizes,
+ OptionalAttr<DenseI64ArrayAttr>:$fastest_varying_pattern,
+ DefaultValuedAttr<BoolAttr, "false">:$vectorize_reductions);
+ let results = (outs);
+
+ let assemblyFormat = [{
+ $target $vector_sizes
+ (`fastest_varying_pattern` `=` $fastest_varying_pattern^)?
+ (`vectorize_reductions` `=` $vectorize_reductions^)?
+ attr-dict `:` type($target)
+ }];
+}
+
#endif // Affine_TRANSFORM_OPS
diff --git a/mlir/include/mlir/Dialect/Affine/Utils.h b/mlir/include/mlir/Dialect/Affine/Utils.h
index ff1900bc8f2ebc..5a094921ebe8ed 100644
--- a/mlir/include/mlir/Dialect/Affine/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Utils.h
@@ -102,6 +102,12 @@ struct VectorizationStrategy {
ReductionLoopMap reductionLoops;
};
+/// Vectorize the affine loops nested under `parentOp`, including `parentOp`
+/// itself if it is an affine loop.
+void vectorizeChildAffineLoops(Operation *parentOp, bool vectorizeReductions,
+                               ArrayRef<int64_t> vectorSizes,
+                               ArrayRef<int64_t> fastestVaryingPattern);
+
/// Replace affine store and load accesses by scalars by forwarding stores to
/// loads and eliminate invariant affine loads; consequently, eliminate dead
/// allocs.
diff --git a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
index 9f7df7823d9979..b60de25f24d46a 100644
--- a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
+++ b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
@@ -12,9 +12,12 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
+#include "mlir/Dialect/Affine/Utils.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/ArrayRef.h"
+#include <cstdint>
using namespace mlir;
using namespace mlir::affine;
@@ -148,6 +151,35 @@ void SimplifyBoundedAffineOpsOp::getEffects(
modifiesPayload(effects);
}
+DiagnosedSilenceableFailure
+SuperVectorizeOp::apply(transform::TransformRewriter &rewriter,
+                        TransformResults &results, TransformState &state) {
+  ArrayRef<int64_t> fastestVaryingPattern;
+  if (getFastestVaryingPattern().has_value()) {
+    if (getFastestVaryingPattern()->size() != getVectorSizes().size())
+      return emitSilenceableFailure(
+          getLoc(), "Fastest varying pattern specified with different size "
+                    "than the vector size.");
+    fastestVaryingPattern = getFastestVaryingPattern().value();
+  }
+
+  for (Operation *target : state.getPayloadOps(getTarget()))
+    if (!target->getParentOfType<affine::AffineForOp>())
+      vectorizeChildAffineLoops(target, getVectorizeReductions(),
+                                getVectorSizes(), fastestVaryingPattern);
+
+  return DiagnosedSilenceableFailure::success();
+}
+
+void SuperVectorizeOp::getEffects(
+ SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+ consumesHandle(getTargetMutable(), effects);
+ modifiesPayload(effects);
+}
+
//===----------------------------------------------------------------------===//
// Transform op registration
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
index 71e9648a5e00fa..4d9bd07c0bad78 100644
--- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp
@@ -26,6 +26,7 @@
#include "mlir/IR/IRMapping.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
#include <optional>
@@ -1741,34 +1742,17 @@ static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
LLVM_DEBUG(dbgs() << "\n");
}
-/// Applies vectorization to the current function by searching over a bunch of
-/// predetermined patterns.
-void Vectorize::runOnOperation() {
- func::FuncOp f = getOperation();
- if (!fastestVaryingPattern.empty() &&
- fastestVaryingPattern.size() != vectorSizes.size()) {
- f.emitRemark("Fastest varying pattern specified with different size than "
- "the vector size.");
- return signalPassFailure();
- }
-
- if (vectorizeReductions && vectorSizes.size() != 1) {
- f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
- return signalPassFailure();
- }
-
- if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) {
- f.emitError("Vectorization factor must be greater than zero.");
- return signalPassFailure();
- }
-
+void affine::vectorizeChildAffineLoops(
+    Operation *parentOp, bool vectorizeReductions,
+    ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern) {
DenseSet<Operation *> parallelLoops;
ReductionLoopMap reductionLoops;
// If 'vectorize-reduction=true' is provided, we also populate the
// `reductionLoops` map.
if (vectorizeReductions) {
-    f.walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
+    parentOp->walk([&parallelLoops, &reductionLoops](AffineForOp loop) {
SmallVector<LoopReduction, 2> reductions;
if (isLoopParallel(loop, &reductions)) {
parallelLoops.insert(loop);
@@ -1778,7 +1762,7 @@ void Vectorize::runOnOperation() {
}
});
} else {
-    f.walk([&parallelLoops](AffineForOp loop) {
+    parentOp->walk([&parallelLoops](AffineForOp loop) {
if (isLoopParallel(loop))
parallelLoops.insert(loop);
});
@@ -1786,10 +1770,34 @@ void Vectorize::runOnOperation() {
// Thread-safe RAII local context, BumpPtrAllocator freed on exit.
NestedPatternContext mlContext;
- vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern,
+ vectorizeLoops(parentOp, parallelLoops, vectorSizes, fastestVaryingPattern,
reductionLoops);
}
+/// Applies vectorization to the current function by searching over a bunch of
+/// predetermined patterns.
+void Vectorize::runOnOperation() {
+ func::FuncOp f = getOperation();
+ if (!fastestVaryingPattern.empty() &&
+ fastestVaryingPattern.size() != vectorSizes.size()) {
+ f.emitRemark("Fastest varying pattern specified with different size than "
+ "the vector size.");
+ return signalPassFailure();
+ }
+
+ if (vectorizeReductions && vectorSizes.size() != 1) {
+ f.emitError("Vectorizing reductions is supported only for 1-D vectors.");
+ return signalPassFailure();
+ }
+
+ if (llvm::any_of(vectorSizes, [](int64_t size) { return size <= 0; })) {
+ f.emitError("Vectorization factor must be greater than zero.");
+ return signalPassFailure();
+ }
+
+  vectorizeChildAffineLoops(f, vectorizeReductions, vectorSizes,
+                            fastestVaryingPattern);
+}
+
/// Verify that affine loops in 'loops' meet the nesting criteria expected by
/// SuperVectorizer:
/// * There must be at least one loop.
diff --git a/mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir b/mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir
new file mode 100644
index 00000000000000..07fd2e4ce1506d
--- /dev/null
+++ b/mlir/test/Dialect/Affine/SuperVectorize/transform_op.mlir
@@ -0,0 +1,169 @@
+// RUN: mlir-opt %s --transform-interpreter -split-input-file | FileCheck %s
+
+// CHECK-LABEL: func @vec1d_3
+func.func @vec1d_3(%A : memref<?x?xf32>, %B : memref<?x?x?xf32>) {
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: [[ARG_M:%[0-9a-zA-Z_]+]] = memref.dim %arg0, %[[C0]] : memref<?x?xf32>
+// CHECK-DAG: [[ARG_N:%[0-9a-zA-Z_]+]] = memref.dim %arg0, %[[C1]] : memref<?x?xf32>
+// CHECK-DAG: [[ARG_P:%[0-9a-zA-Z_]+]] = memref.dim %arg1, %[[C2]] : memref<?x?x?xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %M = memref.dim %A, %c0 : memref<?x?xf32>
+ %N = memref.dim %A, %c1 : memref<?x?xf32>
+ %P = memref.dim %B, %c2 : memref<?x?x?xf32>
+
+// CHECK: for [[IV8:%[0-9a-zA-Z_]+]] = 0 to [[ARG_M]] step 128
+// CHECK-NEXT: for [[IV9:%[0-9a-zA-Z_]*]] = 0 to [[ARG_N]] {
+// CHECK-NEXT: %[[APP9_0:[0-9a-zA-Z_]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
+// CHECK-NEXT: %[[APP9_1:[0-9a-zA-Z_]+]] = affine.apply {{.*}}([[IV9]], [[IV8]])
+// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.0{{.*}}: f32
+// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %[[CST]] : memref<?x?xf32>, vector<128xf32>
+ affine.for %i8 = 0 to %M { // vectorized
+ affine.for %i9 = 0 to %N {
+ %a9 = affine.load %A[%i9, %i8 + %i9] : memref<?x?xf32>
+ }
+ }
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.affine.super_vectorize %0 [128] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_id1:map[0-9]*]] = affine_map<(d0) -> (d0)>
+// CHECK-DAG: #[[$map_proj_d0d1_zerod1:map[0-9]*]] = affine_map<(d0, d1) -> (0, d1)>
+// CHECK-DAG: #[[$map_proj_d0d1_d0zero:map[0-9]*]] = affine_map<(d0, d1) -> (d0, 0)>
+// CHECK-LABEL: func @vectorize_matmul
+func.func @vectorize_matmul(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %M = memref.dim %arg0, %c0 : memref<?x?xf32>
+ %K = memref.dim %arg0, %c1 : memref<?x?xf32>
+ %N = memref.dim %arg2, %c1 : memref<?x?xf32>
+ // CHECK: %[[C0:.*]] = arith.constant 0 : index
+ // CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index
+ // CHECK-NEXT: %[[M:.*]] = memref.dim %{{.*}}, %[[C0]] : memref<?x?xf32>
+ // CHECK-NEXT: %[[K:.*]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
+ // CHECK-NEXT: %[[N:.*]] = memref.dim %{{.*}}, %[[C1]] : memref<?x?xf32>
+ // CHECK: {{.*}} #[[$map_id1]](%[[M]]) step 4 {
+ // CHECK-NEXT: {{.*}} #[[$map_id1]](%[[N]]) step 8 {
+ // CHECK: %[[VC0:.*]] = arith.constant dense<0.000000e+00> : vector<4x8xf32>
+ // CHECK-NEXT: vector.transfer_write %[[VC0]], %{{.*}}[%{{.*}}, %{{.*}}] : vector<4x8xf32>, memref<?x?xf32>
+ affine.for %i0 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%M) {
+ affine.for %i1 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%N) {
+ %cst = arith.constant 0.000000e+00 : f32
+ affine.store %cst, %arg2[%i0, %i1] : memref<?x?xf32>
+ }
+ }
+ // CHECK: affine.for %[[I2:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[M]]) step 4 {
+ // CHECK-NEXT: affine.for %[[I3:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[N]]) step 8 {
+ // CHECK-NEXT: affine.for %[[I4:.*]] = #[[$map_id1]](%[[C0]]) to #[[$map_id1]](%[[K]]) {
+ // CHECK: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_zerod1]]} : memref<?x?xf32>, vector<4x8xf32>
+ // CHECK: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[$map_proj_d0d1_d0zero]]} : memref<?x?xf32>, vector<4x8xf32>
+ // CHECK-NEXT: %[[C:.*]] = arith.mulf %[[B]], %[[A]] : vector<4x8xf32>
+ // CHECK: %[[D:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I3]]], %{{.*}} : memref<?x?xf32>, vector<4x8xf32>
+ // CHECK-NEXT: %[[E:.*]] = arith.addf %[[D]], %[[C]] : vector<4x8xf32>
+ // CHECK: vector.transfer_write %[[E]], %{{.*}}[%[[I2]], %[[I3]]] : vector<4x8xf32>, memref<?x?xf32>
+ affine.for %i2 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%M) {
+ affine.for %i3 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%N) {
+ affine.for %i4 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%K) {
+ %6 = affine.load %arg1[%i4, %i3] : memref<?x?xf32>
+ %7 = affine.load %arg0[%i2, %i4] : memref<?x?xf32>
+ %8 = arith.mulf %7, %6 : f32
+ %9 = affine.load %arg2[%i2, %i3] : memref<?x?xf32>
+ %10 = arith.addf %9, %8 : f32
+ affine.store %10, %arg2[%i2, %i3] : memref<?x?xf32>
+ }
+ }
+ }
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.affine.super_vectorize %0 [4, 8] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @vec3d(%A : memref<?x?x?xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %0 = memref.dim %A, %c0 : memref<?x?x?xf32>
+ %1 = memref.dim %A, %c1 : memref<?x?x?xf32>
+ %2 = memref.dim %A, %c2 : memref<?x?x?xf32>
+ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
+ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} {
+ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 {
+ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 64 {
+ // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 {
+ // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} : memref<?x?x?xf32>, vector<32x64x256xf32>
+ affine.for %t0 = 0 to %0 {
+ affine.for %t1 = 0 to %0 {
+ affine.for %i0 = 0 to %0 {
+ affine.for %i1 = 0 to %1 {
+ affine.for %i2 = 0 to %2 {
+ %a2 = affine.load %A[%i0, %i1, %i2] : memref<?x?x?xf32>
+ }
+ }
+ }
+ }
+ }
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.affine.super_vectorize %0 [32, 64, 256] fastest_varying_pattern=[2,1,0] : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+// CHECK-LABEL: @vecdim_reduction_minf
+// CHECK: affine.for %{{.*}} = 0 to 256 {
+// CHECK: %[[vmax:.*]] = arith.constant dense<0x7F800000> : vector<128xf32>
+// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vmax]]) -> (vector<128xf32>) {
+// CHECK: %[[ld:.*]] = vector.transfer_read %{{.*}} : memref<256x512xf32>, vector<128xf32>
+// CHECK: %[[min:.*]] = arith.minimumf %[[red_iter]], %[[ld]] : vector<128xf32>
+// CHECK: affine.yield %[[min]] : vector<128xf32>
+// CHECK: }
+// CHECK: %[[final_min:.*]] = vector.reduction <minimumf>, %[[vred:.*]] : vector<128xf32> into f32
+// CHECK: affine.store %[[final_min]], %{{.*}} : memref<256xf32>
+// CHECK: }
+
+func.func @vecdim_reduction_minf(%in: memref<256x512xf32>, %out: memref<256xf32>) {
+ %cst = arith.constant 0x7F800000 : f32
+ affine.for %i = 0 to 256 {
+ %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) {
+ %ld = affine.load %in[%i, %j] : memref<256x512xf32>
+ %min = arith.minimumf %red_iter, %ld : f32
+ affine.yield %min : f32
+ }
+ affine.store %final_red, %out[%i] : memref<256xf32>
+ }
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["affine.for"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+ transform.affine.super_vectorize %0 [128] vectorize_reductions=true : !transform.any_op
+ transform.yield
+ }
+}
\ No newline at end of file