[Mlir-commits] [mlir] [mlir] Add pooling ncw vectorization support without needing to transpose to nwc (PR #193208)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Tue Apr 21 05:59:42 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-linalg
Author: Nouman Amir (NoumanAmir657)
<details>
<summary>Changes</summary>
Currently, for 1D pooling in the NCW layout, vectorization first transposes NCW to NWC and then vectorizes the result. This introduces `vector.transpose` ops, which eventually lower to an expensive permutation instruction on architectures such as RVV. To avoid that, this PR adds a direct lowering for NCW pooling that does not require the transpose.
---
Full diff: https://github.com/llvm/llvm-project/pull/193208.diff
2 Files Affected:
- (modified) mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp (+49-11)
- (modified) mlir/test/Dialect/Linalg/vectorization/convolution-with-patterns.mlir (+30-36)
``````````diff
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index b57e66a1c3580..fc1bc8e55cbbe 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -107,11 +107,10 @@ static OpType getSingleOpOfType(Block &block) {
/// Helper function to extract the input slices after filter is unrolled along
/// kw.
-static SmallVector<Value>
-extractConvInputSlices(RewriterBase &rewriter, Location loc, Value input,
- int64_t nSize, int64_t wSize, int64_t cSize,
- int64_t kwSize, int strideW, int dilationW,
- int64_t wSizeStep, bool isSingleChanneled) {
+static SmallVector<Value> extractConvInputSlices(
+ RewriterBase &rewriter, Location loc, Value input, int64_t nSize,
+ int64_t wSize, int64_t cSize, int64_t kwSize, int strideW, int dilationW,
+ int64_t wSizeStep, bool isSingleChanneled, bool isNCWPooling = false) {
SmallVector<Value> result;
if (isSingleChanneled) {
// Extract input slice of size {wSizeStep} @ [w + kw] for non-channeled
@@ -125,6 +124,19 @@ extractConvInputSlices(RewriterBase &rewriter, Location loc, Value input,
strides));
}
}
+ } else if (isNCWPooling) {
+ // Extract lhs slice of size {n, c, wSizeStep} @ [0, 0, sw * w + dw * kw]
+ // for NCW pooling.
+ SmallVector<int64_t> sizes = {nSize, cSize, wSizeStep};
+ SmallVector<int64_t> strides = {1, 1, 1};
+ for (int64_t kw = 0; kw < kwSize; ++kw) {
+ for (int64_t w = 0; w < wSize; w += wSizeStep) {
+ result.push_back(vector::ExtractStridedSliceOp::create(
+ rewriter, loc, input,
+ /*offsets=*/ArrayRef<int64_t>{0, 0, w * strideW + kw * dilationW},
+ sizes, strides));
+ }
+ }
} else {
// Extract lhs slice of size {n, wSizeStep, c} @ [0, sw * w + dw * kw, 0]
// for channeled convolution.
@@ -162,7 +174,8 @@ static SmallVector<Value> extractConvFilterSlices(RewriterBase &rewriter,
static SmallVector<Value>
extractConvResultSlices(RewriterBase &rewriter, Location loc, Value res,
int64_t nSize, int64_t wSize, int64_t fSize,
- int64_t wSizeStep, bool isSingleChanneled) {
+ int64_t wSizeStep, bool isSingleChanneled,
+ bool isNCWPooling = false) {
SmallVector<Value> result;
if (isSingleChanneled) {
// Extract res slice: {wSizeStep} @ [w] for non-channeled convolution.
@@ -173,6 +186,15 @@ extractConvResultSlices(RewriterBase &rewriter, Location loc, Value res,
rewriter, loc, res, /*offsets=*/ArrayRef<int64_t>{w}, sizes,
strides));
}
+ } else if (isNCWPooling) {
+ // Extract res slice: {n, f, wSizeStep} @ [0, 0, w] for NCW pooling.
+ SmallVector<int64_t> sizes = {nSize, fSize, wSizeStep};
+ SmallVector<int64_t> strides = {1, 1, 1};
+ for (int64_t w = 0; w < wSize; w += wSizeStep) {
+ result.push_back(vector::ExtractStridedSliceOp::create(
+ rewriter, loc, res, /*offsets=*/ArrayRef<int64_t>{0, 0, w}, sizes,
+ strides));
+ }
} else {
// Extract res slice: {n, wSizeStep, f} @ [0, w, 0] for channeled
// convolution.
@@ -191,7 +213,8 @@ extractConvResultSlices(RewriterBase &rewriter, Location loc, Value res,
static Value insertConvResultSlices(RewriterBase &rewriter, Location loc,
Value res, int64_t wSize, int64_t wSizeStep,
SmallVectorImpl<Value> &resVals,
- bool isSingleChanneled) {
+ bool isSingleChanneled,
+ bool isNCWPooling = false) {
if (isSingleChanneled) {
// Write back res slice: {wSizeStep} @ [w] for non-channeled convolution.
@@ -202,6 +225,14 @@ static Value insertConvResultSlices(RewriterBase &rewriter, Location loc,
rewriter, loc, resVals[w], res, /*offsets=*/ArrayRef<int64_t>{w},
strides);
}
+ } else if (isNCWPooling) {
+ // Write back res slice: {n, f, wSizeStep} @ [0, 0, w] for NCW pooling.
+ SmallVector<int64_t> strides = {1, 1, 1};
+ for (int64_t w = 0; w < wSize; w += wSizeStep) {
+ res = vector::InsertStridedSliceOp::create(
+ rewriter, loc, resVals[w], res,
+ /*offsets=*/ArrayRef<int64_t>{0, 0, w}, strides);
+ }
} else {
// Write back res slice: {n, wSizeStep, f} @ [0, w, 0] for channeled
// convolution. This does not depend on kw.
@@ -3436,6 +3467,8 @@ struct Conv1DGenerator
int64_t nSize, wSize, cSize, kwSize, fSize;
SmallVector<int64_t, 3> lhsShape, rhsShape, resShape;
bool isSingleChanneled = (conv1DOpOrder == Conv1DOpOrder::W);
+ bool isNCWPooling = (oper == ConvOperationKind::Pool &&
+ conv1DOpOrder == Conv1DOpOrder::Ncw);
switch (conv1DOpOrder) {
case Conv1DOpOrder::W:
// Initialize unused dimensions
@@ -3555,6 +3588,8 @@ struct Conv1DGenerator
// Base case, so no transposes necessary.
break;
case Conv1DOpOrder::Ncw: {
+ if (isNCWPooling)
+ break;
// To match base vectorization case, we pre-transpose current case.
// ncw -> nwc
static constexpr std::array<int64_t, 3> permLhs = {0, 2, 1};
@@ -3579,12 +3614,13 @@ struct Conv1DGenerator
SmallVector<Value> lhsVals, rhsVals, resVals;
lhsVals = extractConvInputSlices(rewriter, loc, lhs, nSize, wSize, cSize,
kwSize, strideW, dilationW, wSizeStep,
- isSingleChanneled);
+ isSingleChanneled, isNCWPooling);
// Do not do for pooling.
if (oper == ConvOperationKind::Conv)
rhsVals = extractConvFilterSlices(rewriter, loc, rhs, kwSize);
- resVals = extractConvResultSlices(rewriter, loc, res, nSize, wSize, fSize,
- wSizeStep, isSingleChanneled);
+ resVals =
+ extractConvResultSlices(rewriter, loc, res, nSize, wSize, fSize,
+ wSizeStep, isSingleChanneled, isNCWPooling);
auto linearIndex = [&](int64_t kw, int64_t w) {
return kw * (wSize / wSizeStep) + w;
@@ -3616,7 +3652,7 @@ struct Conv1DGenerator
}
res = insertConvResultSlices(rewriter, loc, res, wSize, wSizeStep, resVals,
- isSingleChanneled);
+ isSingleChanneled, isNCWPooling);
//===------------------------------------------------------------------===//
// End vector-only rewrite part
//===------------------------------------------------------------------===//
@@ -3630,6 +3666,8 @@ struct Conv1DGenerator
// Base case, so no transposes necessary.
break;
case Conv1DOpOrder::Ncw: {
+ if (isNCWPooling)
+ break;
// nwf -> nfw
static constexpr std::array<int64_t, 3> perm = {0, 2, 1};
res = vector::TransposeOp::create(rewriter, loc, res, perm);
diff --git a/mlir/test/Dialect/Linalg/vectorization/convolution-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/convolution-with-patterns.mlir
index 97b27befd44e2..d443375e525fc 100644
--- a/mlir/test/Dialect/Linalg/vectorization/convolution-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization/convolution-with-patterns.mlir
@@ -1111,18 +1111,16 @@ func.func @pooling_ncw_sum_memref_1_2_1_3(%input: memref<4x3x4xf32>, %filter: me
// CHECK-DAG: %[[Vcst:.+]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[V0:.+]] = vector.transfer_read %[[INPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]], %[[Vcst]] {in_bounds = [true, true, true]} : memref<4x3x4xf32>, vector<4x3x4xf32>
// CHECK: %[[V1:.+]] = vector.transfer_read %[[OUTPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]], %[[Vcst]] {in_bounds = [true, true, true]} : memref<4x3x2xf32>, vector<4x3x2xf32>
-// CHECK: %[[V2:.+]] = vector.transpose %[[V0]], [0, 2, 1] : vector<4x3x4xf32> to vector<4x4x3xf32>
-// CHECK: %[[V3:.+]] = vector.transpose %[[V1]], [0, 2, 1] : vector<4x3x2xf32> to vector<4x2x3xf32>
-// CHECK: %[[V4:.+]] = vector.extract_strided_slice %[[V2]] {offsets = [0, 0, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x4x3xf32> to vector<4x1x3xf32>
-// CHECK: %[[V5:.+]] = vector.extract_strided_slice %[[V2]] {offsets = [0, 3, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x4x3xf32> to vector<4x1x3xf32>
-// CHECK: %[[V6:.+]] = vector.extract_strided_slice %[[V3]] {offsets = [0, 0, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x2x3xf32> to vector<4x1x3xf32>
-// CHECK: %[[V7:.+]] = vector.extract_strided_slice %[[V3]] {offsets = [0, 1, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x2x3xf32> to vector<4x1x3xf32>
-// CHECK: %[[V8:.+]] = arith.addf %[[V4]], %[[V6]] : vector<4x1x3xf32>
-// CHECK: %[[V9:.+]] = arith.addf %[[V5]], %[[V7]] : vector<4x1x3xf32>
-// CHECK: %[[V10:.+]] = vector.insert_strided_slice %[[V8]], %[[V3]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<4x1x3xf32> into vector<4x2x3xf32>
-// CHECK: %[[V11:.+]] = vector.insert_strided_slice %[[V9]], %[[V10]] {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<4x1x3xf32> into vector<4x2x3xf32>
-// CHECK: %[[V12:.+]] = vector.transpose %[[V11]], [0, 2, 1] : vector<4x2x3xf32> to vector<4x3x2xf32>
-// CHECK: vector.transfer_write %[[V12:.+]], %[[OUTPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]] {in_bounds = [true, true, true]} : vector<4x3x2xf32>, memref<4x3x2xf32>
+// CHECK-NOT: vector.transpose
+// CHECK: %[[V2:.+]] = vector.extract_strided_slice %[[V0]] {offsets = [0, 0, 0], sizes = [4, 3, 1], strides = [1, 1, 1]} : vector<4x3x4xf32> to vector<4x3x1xf32>
+// CHECK: %[[V3:.+]] = vector.extract_strided_slice %[[V0]] {offsets = [0, 0, 3], sizes = [4, 3, 1], strides = [1, 1, 1]} : vector<4x3x4xf32> to vector<4x3x1xf32>
+// CHECK: %[[V4:.+]] = vector.extract_strided_slice %[[V1]] {offsets = [0, 0, 0], sizes = [4, 3, 1], strides = [1, 1, 1]} : vector<4x3x2xf32> to vector<4x3x1xf32>
+// CHECK: %[[V5:.+]] = vector.extract_strided_slice %[[V1]] {offsets = [0, 0, 1], sizes = [4, 3, 1], strides = [1, 1, 1]} : vector<4x3x2xf32> to vector<4x3x1xf32>
+// CHECK: %[[V6:.+]] = arith.addf %[[V2]], %[[V4]] : vector<4x3x1xf32>
+// CHECK: %[[V7:.+]] = arith.addf %[[V3]], %[[V5]] : vector<4x3x1xf32>
+// CHECK: %[[V8:.+]] = vector.insert_strided_slice %[[V6]], %[[V1]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<4x3x1xf32> into vector<4x3x2xf32>
+// CHECK: %[[V9:.+]] = vector.insert_strided_slice %[[V7]], %[[V8]] {offsets = [0, 0, 1], strides = [1, 1, 1]} : vector<4x3x1xf32> into vector<4x3x2xf32>
+// CHECK: vector.transfer_write %[[V9]], %[[OUTPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]] {in_bounds = [true, true, true]} : vector<4x3x2xf32>, memref<4x3x2xf32>
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -1212,22 +1210,20 @@ func.func @pooling_ncw_sum_memref_2_2_2_3(%input: memref<4x3x6xf32>, %filter: me
// CHECK-DAG: %[[Vcst:.+]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[V0:.+]] = vector.transfer_read %[[INPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]], %[[Vcst]] {in_bounds = [true, true, true]} : memref<4x3x6xf32>, vector<4x3x6xf32>
// CHECK: %[[V1:.+]] = vector.transfer_read %[[OUTPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]], %[[Vcst]] {in_bounds = [true, true, true]} : memref<4x3x2xf32>, vector<4x3x2xf32>
-// CHECK: %[[V2:.+]] = vector.transpose %[[V0]], [0, 2, 1] : vector<4x3x6xf32> to vector<4x6x3xf32>
-// CHECK: %[[V3:.+]] = vector.transpose %[[V1]], [0, 2, 1] : vector<4x3x2xf32> to vector<4x2x3xf32>
-// CHECK: %[[V4:.+]] = vector.extract_strided_slice %[[V2]] {offsets = [0, 0, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x6x3xf32> to vector<4x1x3xf32>
-// CHECK: %[[V5:.+]] = vector.extract_strided_slice %[[V2]] {offsets = [0, 3, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x6x3xf32> to vector<4x1x3xf32>
-// CHECK: %[[V6:.+]] = vector.extract_strided_slice %[[V2]] {offsets = [0, 2, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x6x3xf32> to vector<4x1x3xf32>
-// CHECK: %[[V7:.+]] = vector.extract_strided_slice %[[V2]] {offsets = [0, 5, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x6x3xf32> to vector<4x1x3xf32>
-// CHECK: %[[V8:.+]] = vector.extract_strided_slice %[[V3]] {offsets = [0, 0, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x2x3xf32> to vector<4x1x3xf32>
-// CHECK: %[[V9:.+]] = vector.extract_strided_slice %[[V3]] {offsets = [0, 1, 0], sizes = [4, 1, 3], strides = [1, 1, 1]} : vector<4x2x3xf32> to vector<4x1x3xf32>
-// CHECK: %[[V10:.+]] = arith.addf %[[V4]], %[[V8]] : vector<4x1x3xf32>
-// CHECK: %[[V11:.+]] = arith.addf %[[V5]], %[[V9]] : vector<4x1x3xf32>
-// CHECK: %[[V12:.+]] = arith.addf %[[V6]], %[[V10]] : vector<4x1x3xf32>
-// CHECK: %[[V13:.+]] = arith.addf %[[V7]], %[[V11]] : vector<4x1x3xf32>
-// CHECK: %[[V14:.+]] = vector.insert_strided_slice %[[V12]], %[[V3]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<4x1x3xf32> into vector<4x2x3xf32>
-// CHECK: %[[V15:.+]] = vector.insert_strided_slice %[[V13]], %[[V14]] {offsets = [0, 1, 0], strides = [1, 1, 1]} : vector<4x1x3xf32> into vector<4x2x3xf32>
-// CHECK: %[[V16:.+]] = vector.transpose %[[V15]], [0, 2, 1] : vector<4x2x3xf32> to vector<4x3x2xf32>
-// CHECK: vector.transfer_write %[[V16:.+]], %[[OUTPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]] {in_bounds = [true, true, true]} : vector<4x3x2xf32>, memref<4x3x2xf32>
+// CHECK-NOT: vector.transpose
+// CHECK: %[[V2:.+]] = vector.extract_strided_slice %[[V0]] {offsets = [0, 0, 0], sizes = [4, 3, 1], strides = [1, 1, 1]} : vector<4x3x6xf32> to vector<4x3x1xf32>
+// CHECK: %[[V3:.+]] = vector.extract_strided_slice %[[V0]] {offsets = [0, 0, 3], sizes = [4, 3, 1], strides = [1, 1, 1]} : vector<4x3x6xf32> to vector<4x3x1xf32>
+// CHECK: %[[V4:.+]] = vector.extract_strided_slice %[[V0]] {offsets = [0, 0, 2], sizes = [4, 3, 1], strides = [1, 1, 1]} : vector<4x3x6xf32> to vector<4x3x1xf32>
+// CHECK: %[[V5:.+]] = vector.extract_strided_slice %[[V0]] {offsets = [0, 0, 5], sizes = [4, 3, 1], strides = [1, 1, 1]} : vector<4x3x6xf32> to vector<4x3x1xf32>
+// CHECK: %[[V6:.+]] = vector.extract_strided_slice %[[V1]] {offsets = [0, 0, 0], sizes = [4, 3, 1], strides = [1, 1, 1]} : vector<4x3x2xf32> to vector<4x3x1xf32>
+// CHECK: %[[V7:.+]] = vector.extract_strided_slice %[[V1]] {offsets = [0, 0, 1], sizes = [4, 3, 1], strides = [1, 1, 1]} : vector<4x3x2xf32> to vector<4x3x1xf32>
+// CHECK: %[[V8:.+]] = arith.addf %[[V2]], %[[V6]] : vector<4x3x1xf32>
+// CHECK: %[[V9:.+]] = arith.addf %[[V3]], %[[V7]] : vector<4x3x1xf32>
+// CHECK: %[[V10:.+]] = arith.addf %[[V4]], %[[V8]] : vector<4x3x1xf32>
+// CHECK: %[[V11:.+]] = arith.addf %[[V5]], %[[V9]] : vector<4x3x1xf32>
+// CHECK: %[[V12:.+]] = vector.insert_strided_slice %[[V10]], %[[V1]] {offsets = [0, 0, 0], strides = [1, 1, 1]} : vector<4x3x1xf32> into vector<4x3x2xf32>
+// CHECK: %[[V13:.+]] = vector.insert_strided_slice %[[V11]], %[[V12]] {offsets = [0, 0, 1], strides = [1, 1, 1]} : vector<4x3x1xf32> into vector<4x3x2xf32>
+// CHECK: vector.transfer_write %[[V13]], %[[OUTPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]] {in_bounds = [true, true, true]} : vector<4x3x2xf32>, memref<4x3x2xf32>
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
@@ -1254,14 +1250,12 @@ func.func @pooling_ncw_sum_memref_2_3_2_1(%input: memref<4x2x5xf32>, %filter: me
// CHECK-DAG: %[[Vcst:.+]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[V0:.+]] = vector.transfer_read %[[INPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]], %[[Vcst]] {in_bounds = [true, true, true]} : memref<4x2x5xf32>, vector<4x2x5xf32>
// CHECK: %[[V1:.+]] = vector.transfer_read %[[OUTPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]], %[[Vcst]] {in_bounds = [true, true, true]} : memref<4x2x3xf32>, vector<4x2x3xf32>
-// CHECK: %[[V2:.+]] = vector.transpose %[[V0]], [0, 2, 1] : vector<4x2x5xf32> to vector<4x5x2xf32>
-// CHECK: %[[V3:.+]] = vector.transpose %[[V1]], [0, 2, 1] : vector<4x2x3xf32> to vector<4x3x2xf32>
-// CHECK: %[[V4:.+]] = vector.extract_strided_slice %[[V2]] {offsets = [0, 0, 0], sizes = [4, 3, 2], strides = [1, 1, 1]} : vector<4x5x2xf32> to vector<4x3x2xf32>
-// CHECK: %[[V5:.+]] = vector.extract_strided_slice %[[V2]] {offsets = [0, 2, 0], sizes = [4, 3, 2], strides = [1, 1, 1]} : vector<4x5x2xf32> to vector<4x3x2xf32>
-// CHECK: %[[V6:.+]] = arith.addf %[[V4]], %[[V3]] : vector<4x3x2xf32>
-// CHECK: %[[V7:.+]] = arith.addf %[[V5]], %[[V6]] : vector<4x3x2xf32>
-// CHECK: %[[V8:.+]] = vector.transpose %[[V7]], [0, 2, 1] : vector<4x3x2xf32> to vector<4x2x3xf32>
-// CHECK: vector.transfer_write %[[V8:.+]], %[[OUTPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]] {in_bounds = [true, true, true]} : vector<4x2x3xf32>, memref<4x2x3xf32>
+// CHECK-NOT: vector.transpose
+// CHECK: %[[V2:.+]] = vector.extract_strided_slice %[[V0]] {offsets = [0, 0, 0], sizes = [4, 2, 3], strides = [1, 1, 1]} : vector<4x2x5xf32> to vector<4x2x3xf32>
+// CHECK: %[[V3:.+]] = vector.extract_strided_slice %[[V0]] {offsets = [0, 0, 2], sizes = [4, 2, 3], strides = [1, 1, 1]} : vector<4x2x5xf32> to vector<4x2x3xf32>
+// CHECK: %[[V4:.+]] = arith.addf %[[V2]], %[[V1]] : vector<4x2x3xf32>
+// CHECK: %[[V5:.+]] = arith.addf %[[V3]], %[[V4]] : vector<4x2x3xf32>
+// CHECK: vector.transfer_write %[[V5]], %[[OUTPUT]][%[[Vc0]], %[[Vc0]], %[[Vc0]]] {in_bounds = [true, true, true]} : vector<4x2x3xf32>, memref<4x2x3xf32>
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
``````````
</details>
https://github.com/llvm/llvm-project/pull/193208
More information about the Mlir-commits
mailing list