[Mlir-commits] [mlir] [mlir][x86vector] Sink Vector.transfer_reads and vector.load before the consumer (PR #169333)
Arun Thangamani
llvmlistbot at llvm.org
Tue Dec 9 21:09:32 PST 2025
https://github.com/arun-thmn updated https://github.com/llvm/llvm-project/pull/169333
>From 95260b8b1a22adfe4f0412f67b8fedea2d200294 Mon Sep 17 00:00:00 2001
From: Arun Thangamani <arun.thangamani at intel.com>
Date: Mon, 24 Nov 2025 06:32:47 -0800
Subject: [PATCH 1/7] sink vector transfer reads and loads before the consumer
---
.../TransformOps/X86VectorTransformOps.td | 11 ++
.../mlir/Dialect/X86Vector/Transforms.h | 4 +
.../TransformOps/X86VectorTransformOps.cpp | 5 +
.../X86Vector/Transforms/CMakeLists.txt | 1 +
.../Transforms/SinkVectorProducerOps.cpp | 87 ++++++++++
.../X86Vector/sink-vector-producer-ops.mlir | 154 ++++++++++++++++++
6 files changed, 262 insertions(+)
create mode 100644 mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
create mode 100644 mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir
diff --git a/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td
index 3c5294ff14fc7..12ba5e9f11141 100644
--- a/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td
+++ b/mlir/include/mlir/Dialect/X86Vector/TransformOps/X86VectorTransformOps.td
@@ -38,6 +38,17 @@ def ApplyVectorContractToPackedTypeDotProductPatternsOp : Op<Transform_Dialect,
let assemblyFormat = "attr-dict";
}
+def ApplySinkVectorProducerOpsPatternsOp : Op<Transform_Dialect,
+ "apply_patterns.x86vector.sink_vector_producer_ops",
+ [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
+ let description = [{
+ Collect patterns to sink vector producer operations forward in a block to
+ place them immediately before their first use.
+ }];
+
+ let assemblyFormat = "attr-dict";
+}
+
#endif // X86VECTOR_TRANSFORM_OPS
diff --git a/mlir/include/mlir/Dialect/X86Vector/Transforms.h b/mlir/include/mlir/Dialect/X86Vector/Transforms.h
index fc46dff63c2b7..b9c9054f57890 100644
--- a/mlir/include/mlir/Dialect/X86Vector/Transforms.h
+++ b/mlir/include/mlir/Dialect/X86Vector/Transforms.h
@@ -91,6 +91,10 @@ void populateVectorContractToFMAPatterns(RewritePatternSet &patterns);
void populateVectorContractToPackedTypeDotProductPatterns(
RewritePatternSet &patterns);
+// Adds patterns that sink vector.load / vector.transfer_read ops to just before
+// their first user in the same block, reducing the producers' live ranges.
+void populateSinkVectorProducerOpsPatterns(RewritePatternSet &patterns);
+
//===----------------------------------------------------------------------===//
/// Helpers extracted from:
/// - clang/lib/Headers/avxintrin.h
diff --git a/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp b/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp
index 95db208207672..25772f2aa57f4 100644
--- a/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp
+++ b/mlir/lib/Dialect/X86Vector/TransformOps/X86VectorTransformOps.cpp
@@ -32,6 +32,11 @@ void mlir::transform::ApplyVectorContractToPackedTypeDotProductPatternsOp::
x86vector::populateVectorContractToPackedTypeDotProductPatterns(patterns);
}
+void mlir::transform::ApplySinkVectorProducerOpsPatternsOp::populatePatterns(
+ RewritePatternSet &patterns) {
+ x86vector::populateSinkVectorProducerOpsPatterns(patterns);
+}
+
//===----------------------------------------------------------------------===//
// Transform op registration
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt b/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt
index 2cab50fb591c4..cc4d3cac0f7ea 100644
--- a/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/X86Vector/Transforms/CMakeLists.txt
@@ -3,6 +3,7 @@ add_mlir_dialect_library(MLIRX86VectorTransforms
LegalizeForLLVMExport.cpp
VectorContractToFMA.cpp
VectorContractToPackedTypeDotProduct.cpp
+ SinkVectorProducerOps.cpp
LINK_LIBS PUBLIC
MLIRArithDialect
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
new file mode 100644
index 0000000000000..85cb18687a4fc
--- /dev/null
+++ b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
@@ -0,0 +1,87 @@
+//===- SinkVectorProducerOps.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
+#include "mlir/Dialect/X86Vector/Transforms.h"
+#include "mlir/Dialect/X86Vector/X86VectorDialect.h"
+
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/PatternMatch.h"
+
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace mlir::vector;
+using namespace mlir::x86vector;
+
+/// Sink vector producers forward to reduce live ranges.
+/// This pattern applies to ops such as vector.load and vector.transfer_read.
+template <typename producerOp>
+struct SinkVectorProducerOps final : public OpRewritePattern<producerOp> {
+ using OpRewritePattern<producerOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(producerOp op,
+ PatternRewriter &rewriter) const override {
+
+ // Collect all users of the producer op.
+ llvm::SmallVector<Operation *> users;
+ for (OpResult result : op->getResults())
+ for (Operation *user : result.getUsers())
+ users.push_back(user);
+
+ // If there are no users, nothing to sink.
+ if (users.empty())
+ return failure();
+
+ // If the next op is already a user, do not move.
+ Operation *nextOp = op->getNextNode();
+ if (llvm::is_contained(users, nextOp))
+ return failure();
+
+ // Prevent pathological looping:
+ // If the next op produces values used by any of op's users, don't move.
+ llvm::SmallVector<Operation *> nextOpUsers;
+ for (OpResult result : nextOp->getResults())
+ for (Operation *user : result.getUsers())
+ nextOpUsers.push_back(user);
+ if (llvm::any_of(users, [&](Operation *x) {
+ return llvm::is_contained(nextOpUsers, x);
+ })) {
+ return failure();
+ }
+
+ // Find the nearest user by scanning forward.
+ while (nextOp) {
+ if (llvm::is_contained(users, nextOp))
+ break;
+
+ nextOp = nextOp->getNextNode();
+ }
+
+ if (!nextOp)
+ return failure();
+
+ // // Both ops must be in the same block to safely move.
+ if (op->getBlock() != nextOp->getBlock())
+ return failure();
+
+ // Move producer immediately before its first user.
+ op->moveBefore(nextOp);
+
+ return success();
+ }
+};
+
+void x86vector::populateSinkVectorProducerOpsPatterns(
+ RewritePatternSet &patterns) {
+ patterns.add<SinkVectorProducerOps<vector::TransferReadOp>,
+ SinkVectorProducerOps<vector::LoadOp>>(patterns.getContext());
+}
diff --git a/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir b/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir
new file mode 100644
index 0000000000000..04045b05bda49
--- /dev/null
+++ b/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir
@@ -0,0 +1,154 @@
+// RUN: mlir-opt %s -transform-interpreter -cse -split-input-file | FileCheck %s
+
+func.func @sink_vector_loads(%arg0: memref<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> {
+ %c0 = arith.constant 0 : index
+ %c8 = arith.constant 8 : index
+ %0 = vector.load %arg0[%c0, %c0] : memref<16x16xf32>, vector<8xf32>
+ %1 = vector.load %arg0[%c0, %c8] : memref<16x16xf32>, vector<8xf32>
+ %2 = vector.load %arg0[%c8, %c0] : memref<16x16xf32>, vector<8xf32>
+ %3 = vector.load %arg0[%c8, %c8] : memref<16x16xf32>, vector<8xf32>
+ %4 = vector.fma %0, %1, %arg1 : vector<8xf32>
+ %5 = vector.fma %2, %3, %4 : vector<8xf32>
+ return %5 : vector<8xf32>
+}
+
+// CHECK-LABEL: @sink_vector_loads
+// CHECK: vector.load
+// CHECK-NEXT: vector.load
+// CHECK-NEXT: vector.fma
+// CHECK-NEXT: vector.load
+// CHECK-NEXT: vector.load
+// CHECK-NEXT: vector.fma
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.x86vector.sink_vector_producer_ops
+ } : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @sink_vector_transfer_reads(%arg0: memref<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> {
+ %c0 = arith.constant 0 : index
+ %c8 = arith.constant 8 : index
+ %0 = ub.poison : f32
+ %1 = vector.transfer_read %arg0[%c0, %c0], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32>
+ %2 = vector.transfer_read %arg0[%c0, %c8], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32>
+ %3 = vector.transfer_read %arg0[%c8, %c0], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32>
+ %4 = vector.transfer_read %arg0[%c8, %c8], %0 {in_bounds = [true]} : memref<16x16xf32>, vector<8xf32>
+ %5 = vector.fma %1, %2, %arg1 : vector<8xf32>
+ %6 = vector.fma %3, %4, %5 : vector<8xf32>
+ return %6 : vector<8xf32>
+}
+
+// CHECK-LABEL: @sink_vector_transfer_reads
+// CHECK: vector.transfer_read
+// CHECK-NEXT: vector.transfer_read
+// CHECK-NEXT: vector.fma
+// CHECK-NEXT: vector.transfer_read
+// CHECK-NEXT: vector.transfer_read
+// CHECK-NEXT: vector.fma
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.x86vector.sink_vector_producer_ops
+ } : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @sink_vector_transfer_reads_tensor(%arg0: tensor<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> {
+ %c0 = arith.constant 0 : index
+ %c8 = arith.constant 8 : index
+ %0 = ub.poison : f32
+ %1 = vector.transfer_read %arg0[%c0, %c0], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32>
+ %2 = vector.transfer_read %arg0[%c0, %c8], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32>
+ %3 = vector.transfer_read %arg0[%c8, %c0], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32>
+ %4 = vector.transfer_read %arg0[%c8, %c8], %0 {in_bounds = [true]} : tensor<16x16xf32>, vector<8xf32>
+ %5 = vector.fma %1, %2, %arg1 : vector<8xf32>
+ %6 = vector.fma %3, %4, %5 : vector<8xf32>
+ return %6 : vector<8xf32>
+}
+
+// CHECK-LABEL: @sink_vector_transfer_reads_tensor
+// CHECK: vector.transfer_read
+// CHECK-NEXT: vector.transfer_read
+// CHECK-NEXT: vector.fma
+// CHECK-NEXT: vector.transfer_read
+// CHECK-NEXT: vector.transfer_read
+// CHECK-NEXT: vector.fma
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.x86vector.sink_vector_producer_ops
+ } : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @negative_no_infinite_looping(%arg0: memref<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> {
+ %c0 = arith.constant 0 : index
+ %c8 = arith.constant 8 : index
+ %0 = vector.load %arg0[%c0, %c0] : memref<16x16xf32>, vector<8xf32>
+ %1 = vector.load %arg0[%c0, %c8] : memref<16x16xf32>, vector<8xf32>
+ %2 = vector.fma %0, %1, %arg1 : vector<8xf32>
+ return %2: vector<8xf32>
+}
+
+// CHECK-LABEL: @negative_no_infinite_looping
+// CHECK: vector.load
+// CHECK-NEXT: vector.load
+// CHECK-NEXT: vector.fma
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.x86vector.sink_vector_producer_ops
+ } : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
+func.func @negative_no_sink_outside_block(%arg0: memref<8x16xf32>, %arg1: i1) -> vector<8xf32> {
+ %c0 = arith.constant 0 : index
+ %c8 = arith.constant 8 : index
+ %0 = vector.load %arg0[%c0, %c0] : memref<8x16xf32>, vector<8xf32>
+ %1 = vector.load %arg0[%c0, %c8] : memref<8x16xf32>, vector<8xf32>
+ %2 = scf.if %arg1 -> (vector<8xf32>) {
+ scf.yield %0 : vector<8xf32>
+ } else {
+ scf.yield %1 : vector<8xf32>
+ }
+ return %2 : vector<8xf32>
+}
+
+// CHECK-LABEL: @negative_no_sink_outside_block
+// CHECK: vector.load
+// CHECK-NEXT: vector.load
+// CHECK-NEXT: scf.if
+// CHECK-NEXT: scf.yield
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.x86vector.sink_vector_producer_ops
+ } : !transform.any_op
+ transform.yield
+ }
+}
>From e3e18a74d387b8bb789eb0fb723586d5d308388f Mon Sep 17 00:00:00 2001
From: Arun Thangamani <arun.thangamani at intel.com>
Date: Mon, 24 Nov 2025 07:34:18 -0800
Subject: [PATCH 2/7] added a bf16 test-case
---
.../Transforms/SinkVectorProducerOps.cpp | 15 +++++--
.../X86Vector/sink-vector-producer-ops.mlir | 45 +++++++++++++++++++
2 files changed, 56 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
index 85cb18687a4fc..eb60e3e21d515 100644
--- a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
+++ b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
@@ -52,12 +52,19 @@ struct SinkVectorProducerOps final : public OpRewritePattern<producerOp> {
for (OpResult result : nextOp->getResults())
for (Operation *user : result.getUsers())
nextOpUsers.push_back(user);
- if (llvm::any_of(users, [&](Operation *x) {
- return llvm::is_contained(nextOpUsers, x);
- })) {
- return failure();
+
+ Operation *nextFirstUser = nextOp->getNextNode();
+ while (nextFirstUser) {
+ if (llvm::is_contained(nextOpUsers, nextFirstUser))
+ break;
+
+ nextFirstUser = nextFirstUser->getNextNode();
}
+ if (llvm::is_contained(users, nextFirstUser))
+ return failure();
+
+
// Find the nearest user by scanning forward.
while (nextOp) {
if (llvm::is_contained(users, nextOp))
diff --git a/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir b/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir
index 04045b05bda49..11af315e69e66 100644
--- a/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir
+++ b/mlir/test/Dialect/X86Vector/sink-vector-producer-ops.mlir
@@ -98,6 +98,50 @@ module attributes {transform.with_named_sequence} {
// -----
+#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4, d1)>
+#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3, d1)>
+#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3)>
+
+func.func @sink_vector_transfer_reads_bf16(%arg0: tensor<4x64x32x2xbf16>, %arg1: tensor<4x32x64x2xbf16>, %arg2: vector<1x16xf32>) -> vector<1x16xf32> {
+ %0 = ub.poison : bf16
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c16 = arith.constant 16 : index
+ %extracted_slice = tensor.extract_slice %arg0[%c0, %c0, %c0, 0] [1, 4, 1, 2] [1, 1, 1, 1] : tensor<4x64x32x2xbf16> to tensor<1x4x1x2xbf16>
+ %extracted_slice_0 = tensor.extract_slice %arg1[%c0, %c0, %c0, 0] [1, 1, 32, 2] [1, 1, 1, 1] : tensor<4x32x64x2xbf16> to tensor<1x1x32x2xbf16>
+ %1 = vector.transfer_read %extracted_slice[%c0, %c0, %c0, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x2xbf16>, vector<1x1x1x2xbf16>
+ %2 = vector.transfer_read %extracted_slice[%c0, %c1, %c0, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x4x1x2xbf16>, vector<1x1x1x2xbf16>
+ %3 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c0, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x1x32x2xbf16>, vector<1x1x16x2xbf16>
+ %4 = vector.transfer_read %extracted_slice_0[%c0, %c0, %c16, %c0], %0 {in_bounds = [true, true, true, true]} : tensor<1x1x32x2xbf16>, vector<1x1x16x2xbf16>
+ %5 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %3, %arg2 {unroll_shape = array<i64: 1, 2, 1, 16, 1>} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32>
+ %6 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %1, %4, %5 {unroll_shape = array<i64: 1, 2, 1, 16, 1>} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32>
+ %7 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %2, %3, %6 {unroll_shape = array<i64: 1, 2, 1, 16, 1>} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32>
+ %8 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %2, %4, %7 {unroll_shape = array<i64: 1, 2, 1, 16, 1>} : vector<1x1x1x2xbf16>, vector<1x1x16x2xbf16> into vector<1x16xf32>
+ return %8 : vector<1x16xf32>
+}
+
+// CHECK-LABEL: @sink_vector_transfer_reads_bf16
+// CHECK: vector.transfer_read
+// CHECK-NEXT: vector.transfer_read
+// CHECK-NEXT: vector.contract
+// CHECK-NEXT: vector.transfer_read
+// CHECK-NEXT: vector.contract
+// CHECK-NEXT: vector.transfer_read
+// CHECK-NEXT: vector.contract
+// CHECK-NEXT: vector.contract
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %0 {
+ transform.apply_patterns.x86vector.sink_vector_producer_ops
+ } : !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
func.func @negative_no_infinite_looping(%arg0: memref<16x16xf32>, %arg1: vector<8xf32>) -> vector<8xf32> {
%c0 = arith.constant 0 : index
%c8 = arith.constant 8 : index
@@ -152,3 +196,4 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
+
>From 0a1f4233c4363f5f5ee4928629e198a23e0a8e6a Mon Sep 17 00:00:00 2001
From: Arun Thangamani <arun.thangamani at intel.com>
Date: Mon, 24 Nov 2025 07:38:11 -0800
Subject: [PATCH 3/7] fix clang-format errors
---
mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
index eb60e3e21d515..b31636958e158 100644
--- a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
+++ b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
@@ -64,7 +64,6 @@ struct SinkVectorProducerOps final : public OpRewritePattern<producerOp> {
if (llvm::is_contained(users, nextFirstUser))
return failure();
-
// Find the nearest user by scanning forward.
while (nextOp) {
if (llvm::is_contained(users, nextOp))
>From 2bd09d8ba1feedec339d94cb1e727058f209de3f Mon Sep 17 00:00:00 2001
From: Arun Thangamani <arun.thangamani at intel.com>
Date: Mon, 24 Nov 2025 20:12:26 -0800
Subject: [PATCH 4/7] validate based on first user
---
.../X86Vector/Transforms/SinkVectorProducerOps.cpp | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
index b31636958e158..95d970c404b70 100644
--- a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
+++ b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
@@ -61,9 +61,6 @@ struct SinkVectorProducerOps final : public OpRewritePattern<producerOp> {
nextFirstUser = nextFirstUser->getNextNode();
}
- if (llvm::is_contained(users, nextFirstUser))
- return failure();
-
// Find the nearest user by scanning forward.
while (nextOp) {
if (llvm::is_contained(users, nextOp))
@@ -75,7 +72,12 @@ struct SinkVectorProducerOps final : public OpRewritePattern<producerOp> {
if (!nextOp)
return failure();
- // // Both ops must be in the same block to safely move.
+ // The Op first user and next Op first user are same. Break here to
+ // to avoid the shift cycle looping.
+ if (nextOp == nextFirstUser)
+ return failure();
+
+ // Both ops must be in the same block to safely move.
if (op->getBlock() != nextOp->getBlock())
return failure();
>From 77b8b3848cce6af1df1984cae43cff4fb8a8e27c Mon Sep 17 00:00:00 2001
From: Arun Thangamani <arun.thangamani at intel.com>
Date: Fri, 28 Nov 2025 07:28:37 -0800
Subject: [PATCH 5/7] code re-factor
---
.../Transforms/SinkVectorProducerOps.cpp | 50 ++++++++++++-------
1 file changed, 33 insertions(+), 17 deletions(-)
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
index 95d970c404b70..6a6327a13b946 100644
--- a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
+++ b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
@@ -32,57 +32,73 @@ struct SinkVectorProducerOps final : public OpRewritePattern<producerOp> {
PatternRewriter &rewriter) const override {
// Collect all users of the producer op.
- llvm::SmallVector<Operation *> users;
+ llvm::SmallVector<Operation *> opUsers;
for (OpResult result : op->getResults())
for (Operation *user : result.getUsers())
- users.push_back(user);
+ opUsers.push_back(user);
// If there are no users, nothing to sink.
- if (users.empty())
+ if (opUsers.empty())
return failure();
// If the next op is already a user, do not move.
Operation *nextOp = op->getNextNode();
- if (llvm::is_contained(users, nextOp))
+ if (llvm::is_contained(opUsers, nextOp))
return failure();
// Prevent pathological looping:
- // If the next op produces values used by any of op's users, don't move.
+ // If two producers are used by same consumer, will end in looping of
+ // moving the producers.
+ // For example:
+ // %1 = prod1
+ // %2 = prod2
+ // %3 = op %1, %2
llvm::SmallVector<Operation *> nextOpUsers;
for (OpResult result : nextOp->getResults())
for (Operation *user : result.getUsers())
nextOpUsers.push_back(user);
- Operation *nextFirstUser = nextOp->getNextNode();
- while (nextFirstUser) {
- if (llvm::is_contained(nextOpUsers, nextFirstUser))
+ // Both producers have one same users.
+ if (opUsers.size() == 1 && nextOpUsers.size() != 1 &&
+ llvm::is_contained(opUsers, nextOpUsers.front()))
+ return failure();
+
+ // Get the first user of both the current and next operation.
+ Operation *opFirstUser = op->getNextNode();
+ Operation *nextOpFirstUser = op->getNextNode();
+
+ while (opFirstUser) {
+ if (llvm::is_contained(opUsers, opFirstUser))
break;
- nextFirstUser = nextFirstUser->getNextNode();
+ opFirstUser = opFirstUser->getNextNode();
}
- // Find the nearest user by scanning forward.
- while (nextOp) {
- if (llvm::is_contained(users, nextOp))
+ while (nextOpFirstUser) {
+ if (llvm::is_contained(nextOpUsers, nextOpFirstUser))
break;
- nextOp = nextOp->getNextNode();
+ nextOpFirstUser = nextOpFirstUser->getNextNode();
}
- if (!nextOp)
+ if (!opFirstUser)
return failure();
// The Op first user and next Op first user are same. Break here to
// to avoid the shift cycle looping.
- if (nextOp == nextFirstUser)
+ if (opFirstUser == nextOpFirstUser)
return failure();
// Both ops must be in the same block to safely move.
- if (op->getBlock() != nextOp->getBlock())
+ if (op->getBlock() != opFirstUser->getBlock())
return failure();
// Move producer immediately before its first user.
- op->moveBefore(nextOp);
+ op->moveBefore(opFirstUser);
+
+ // Move the nextOp to its first user
+ if (nextOpFirstUser && (nextOpFirstUser->getBlock() == nextOp->getBlock()))
+ nextOp->moveBefore(nextOpFirstUser);
return success();
}
>From 9b96ad3445d4e54bb3bd46958aa2c6fcbc91f3b1 Mon Sep 17 00:00:00 2001
From: Arun Thangamani <arun.thangamani at intel.com>
Date: Sun, 7 Dec 2025 21:39:20 -0800
Subject: [PATCH 6/7] code re-factor: move prod before consumer in the first
pass
---
.../Transforms/SinkVectorProducerOps.cpp | 151 +++++++++++-------
1 file changed, 94 insertions(+), 57 deletions(-)
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
index 6a6327a13b946..b765116b94244 100644
--- a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
+++ b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
@@ -22,6 +22,58 @@ using namespace mlir;
using namespace mlir::vector;
using namespace mlir::x86vector;
+static FailureOr<llvm::SmallVector<Operation *>> getAllUsers(Operation *op) {
+ llvm::SmallVector<Operation *> opUsers;
+ for (OpResult result : op->getResults()) {
+ for (Operation *user : result.getUsers()) {
+ // Check that the producer and its users belong to the same block.
+ if (op->getBlock() != user->getBlock())
+ return failure();
+ opUsers.push_back(user);
+ }
+ }
+
+ return opUsers;
+}
+
+// Prevent pathological looping:
+// If two or more producers are used by the same consumer, the pattern would
+// keep moving the producers in an endless loop.
+// For example:
+// %1 = prod1
+// %2 = prod2
+// %3 = prod3
+// %4 = op %1, %2, %3
+static bool checkLooping(Operation *op) {
+ llvm::SmallVector<Operation *> operations;
+ operations.push_back(op);
+
+ // Retrive the next immediate two/three operation until it is a vector.load or
+ // a vector.transfer_read
+ Operation *nextOp = op->getNextNode();
+ while (operations.size() < 3 && nextOp) {
+ if (isa<vector::LoadOp>(nextOp) || isa<vector::TransferReadOp>(nextOp)) {
+ operations.push_back(op);
+ } else {
+ break;
+ }
+ nextOp = nextOp->getNextNode();
+ }
+
+ // If all the loads or transfer_reads have same immediate nextOp as its
+ // user, then it loops.
+ for (Operation *op : operations) {
+ FailureOr<llvm::SmallVector<Operation *>> users = getAllUsers(op);
+ if (failed(users))
+ return false;
+
+ if (!llvm::is_contained(*users, nextOp))
+ return false;
+ }
+
+ return true;
+}
+
/// Sink vector producers forward to reduce live ranges.
/// This pattern applies to ops such as vector.load and vector.transfer_read.
template <typename producerOp>
@@ -31,75 +83,60 @@ struct SinkVectorProducerOps final : public OpRewritePattern<producerOp> {
LogicalResult matchAndRewrite(producerOp op,
PatternRewriter &rewriter) const override {
- // Collect all users of the producer op.
- llvm::SmallVector<Operation *> opUsers;
- for (OpResult result : op->getResults())
- for (Operation *user : result.getUsers())
- opUsers.push_back(user);
-
- // If there are no users, nothing to sink.
- if (opUsers.empty())
+ auto users = getAllUsers(op);
+ if (failed(users))
return failure();
- // If the next op is already a user, do not move.
- Operation *nextOp = op->getNextNode();
- if (llvm::is_contained(opUsers, nextOp))
+ if (checkLooping(op))
return failure();
- // Prevent pathological looping:
- // If two producers are used by same consumer, will end in looping of
- // moving the producers.
- // For example:
- // %1 = prod1
- // %2 = prod2
- // %3 = op %1, %2
- llvm::SmallVector<Operation *> nextOpUsers;
- for (OpResult result : nextOp->getResults())
- for (Operation *user : result.getUsers())
- nextOpUsers.push_back(user);
-
- // Both producers have one same users.
- if (opUsers.size() == 1 && nextOpUsers.size() != 1 &&
- llvm::is_contained(opUsers, nextOpUsers.front()))
- return failure();
+ llvm::DenseMap<Operation *, llvm::SmallVector<Operation *>> prodsAllUsers;
+ llvm::DenseMap<Operation *, Operation *> prodsFirstUser;
- // Get the first user of both the current and next operation.
- Operation *opFirstUser = op->getNextNode();
- Operation *nextOpFirstUser = op->getNextNode();
+ llvm::SmallVector<Operation *> opUsers = *users;
+ prodsAllUsers.try_emplace(op, opUsers);
- while (opFirstUser) {
- if (llvm::is_contained(opUsers, opFirstUser))
- break;
+ // Iterate until the last instruction to find the first users of all
+ // producers within the block.
+ Operation *nextOp = op->getNextNode();
- opFirstUser = opFirstUser->getNextNode();
+ while (nextOp) {
+
+ if (isa<vector::LoadOp>(nextOp) || isa<vector::TransferReadOp>(nextOp)) {
+ auto nextUsers = getAllUsers(nextOp);
+
+ if (failed(nextUsers))
+ continue;
+ llvm::SmallVector<Operation *> nextOpUsers = *nextUsers;
+ prodsAllUsers.try_emplace(nextOp, nextOpUsers);
+ } else {
+ llvm::SmallVector<Operation *> operations;
+
+ for (auto &entry : prodsAllUsers) {
+ llvm::SmallVector<Operation *> &users = entry.second;
+
+ if (llvm::is_contained(users, nextOp)) {
+ Operation *operation = entry.first;
+ operations.push_back(operation);
+ prodsFirstUser.try_emplace(operation, nextOp);
+ }
+ }
+
+ for (Operation *op : operations) {
+ prodsAllUsers.erase(op);
+ }
+ }
+ nextOp = nextOp->getNextNode();
}
- while (nextOpFirstUser) {
- if (llvm::is_contained(nextOpUsers, nextOpFirstUser))
- break;
+ // Move all the loads or transfer_reads before its first use.
+ for (auto &entry : prodsFirstUser) {
+ Operation *prod = entry.first;
+ Operation *consumer = entry.second;
- nextOpFirstUser = nextOpFirstUser->getNextNode();
+ prod->moveBefore(consumer);
}
- if (!opFirstUser)
- return failure();
-
- // The Op first user and next Op first user are same. Break here to
- // to avoid the shift cycle looping.
- if (opFirstUser == nextOpFirstUser)
- return failure();
-
- // Both ops must be in the same block to safely move.
- if (op->getBlock() != opFirstUser->getBlock())
- return failure();
-
- // Move producer immediately before its first user.
- op->moveBefore(opFirstUser);
-
- // Move the nextOp to its first user
- if (nextOpFirstUser && (nextOpFirstUser->getBlock() == nextOp->getBlock()))
- nextOp->moveBefore(nextOpFirstUser);
-
return success();
}
};
>From 70f4f78c4ab67c4c65b761f82a100ced6f3489d3 Mon Sep 17 00:00:00 2001
From: Arun Thangamani <arun.thangamani at intel.com>
Date: Tue, 9 Dec 2025 21:08:56 -0800
Subject: [PATCH 7/7] fix a missing loop-hole in while case + proper variable
name
---
.../Transforms/SinkVectorProducerOps.cpp | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
index b765116b94244..e5a8d619f7731 100644
--- a/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
+++ b/mlir/lib/Dialect/X86Vector/Transforms/SinkVectorProducerOps.cpp
@@ -22,7 +22,8 @@ using namespace mlir;
using namespace mlir::vector;
using namespace mlir::x86vector;
-static FailureOr<llvm::SmallVector<Operation *>> getAllUsers(Operation *op) {
+static FailureOr<llvm::SmallVector<Operation *>>
+getSameBlockUsers(Operation *op) {
llvm::SmallVector<Operation *> opUsers;
for (OpResult result : op->getResults()) {
for (Operation *user : result.getUsers()) {
@@ -48,10 +49,10 @@ static bool checkLooping(Operation *op) {
llvm::SmallVector<Operation *> operations;
operations.push_back(op);
- // Retrive the next immediate two/three operation until it is a vector.load or
+ // Retrieve the next immediate operation until it is a vector.load or
// a vector.transfer_read
Operation *nextOp = op->getNextNode();
- while (operations.size() < 3 && nextOp) {
+ while (nextOp) {
if (isa<vector::LoadOp>(nextOp) || isa<vector::TransferReadOp>(nextOp)) {
operations.push_back(op);
} else {
@@ -63,7 +64,7 @@ static bool checkLooping(Operation *op) {
// If all the loads or transfer_reads have same immediate nextOp as its
// user, then it loops.
for (Operation *op : operations) {
- FailureOr<llvm::SmallVector<Operation *>> users = getAllUsers(op);
+ FailureOr<llvm::SmallVector<Operation *>> users = getSameBlockUsers(op);
if (failed(users))
return false;
@@ -83,7 +84,7 @@ struct SinkVectorProducerOps final : public OpRewritePattern<producerOp> {
LogicalResult matchAndRewrite(producerOp op,
PatternRewriter &rewriter) const override {
- auto users = getAllUsers(op);
+ auto users = getSameBlockUsers(op);
if (failed(users))
return failure();
@@ -98,12 +99,12 @@ struct SinkVectorProducerOps final : public OpRewritePattern<producerOp> {
// Iterate until the last instruction to find the first users of all
// producers within the block.
- Operation *nextOp = op->getNextNode();
+ Operation *nextOp = op;
- while (nextOp) {
+ while ((nextOp = nextOp->getNextNode())) {
if (isa<vector::LoadOp>(nextOp) || isa<vector::TransferReadOp>(nextOp)) {
- auto nextUsers = getAllUsers(nextOp);
+ auto nextUsers = getSameBlockUsers(nextOp);
if (failed(nextUsers))
continue;
@@ -126,7 +127,6 @@ struct SinkVectorProducerOps final : public OpRewritePattern<producerOp> {
prodsAllUsers.erase(op);
}
}
- nextOp = nextOp->getNextNode();
}
// Move all the loads or transfer_reads before its first use.
More information about the Mlir-commits
mailing list