[Mlir-commits] [mlir] 84c66f4 - [mlir][vector] Add assumeAligned mode to vector.store narrow type emulation (#178565)

Thu Jan 29 10:27:23 PST 2026

Author: Han-Chung Wang
Date: 2026-01-29T18:27:18Z
New Revision: 84c66f4f0d2917fbf21f8a37718b9cbe56a19db1

URL: https://github.com/llvm/llvm-project/commit/84c66f4f0d2917fbf21f8a37718b9cbe56a19db1
DIFF: https://github.com/llvm/llvm-project/commit/84c66f4f0d2917fbf21f8a37718b9cbe56a19db1.diff

LOG: [mlir][vector] Add assumeAligned mode to vector.store narrow type emulation (#178565)

This revision adds a new `assumeAligned` mode to the emulation, so that
downstream projects can take the simple path when they meet its requirements.
E.g., if the store offset is always aligned to the container's element type,
we can skip the check of front padding sizes.

---------

Signed-off-by: hanhanW <hanhan0912 at gmail.com>

Added: 
    mlir/test/Dialect/Vector/vector-emulate-narrow-type-aligned-store-only.mlir

Modified: 
    mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h
    mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
    mlir/test/lib/Dialect/MemRef/TestEmulateNarrowType.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h
index 13ad8151029f2..00b321fecc09f 100644

--- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h
@@ -388,10 +388,15 @@ void populateVectorMaskMaterializationPatterns(RewritePatternSet &patterns,
 /// Appends patterns for emulating vector operations over narrow types with ops
 /// over wider types. The `disableAtomicRMW` indicates whether to use a normal
 /// read-modify-write sequence instead of using `memref.generic_atomic_rmw` to
-/// perform subbyte storing.
+/// perform subbyte storing. When `assumeAligned` is true, store offsets are
+/// assumed to be aligned to container element boundaries, so a store whose
+/// source vector fills whole container elements is emitted as a simple
+/// bitcast + store without checking the offset. Stores whose source vector
+/// does not fill whole container elements fail to match.
 void populateVectorNarrowTypeEmulationPatterns(
     const arith::NarrowTypeEmulationConverter &typeConverter,
-    RewritePatternSet &patterns, bool disableAtomicRMW = false);
+    RewritePatternSet &patterns, bool disableAtomicRMW = false,
+    bool assumeAligned = false);
 
 /// Populates patterns for both MeMref flattening and Vector narrow type
 /// emulation.

diff  --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index ccb3c01669f18..c53242b526cb6 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -511,6 +511,13 @@ namespace {
 
 // Emulate `vector.store` using a multi-byte container type.
 //
+// When `assumeAligned` is true, store offsets are assumed to be aligned to
+// container element boundaries, so a store whose source vector fills whole
+// container elements (isDivisibleInSize) is emitted as a simple bitcast +
+// store without checking the offset. Stores that are not divisible in size
+// are rejected. This is useful for downstream users that have already
+// ensured alignment.
+//
 // The container type is obtained through Op adaptor and would normally be
 // generated via `NarrowTypeEmulationConverter`.
 //
@@ -551,9 +558,10 @@ namespace {
 struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
   using Base::Base;
 
-  ConvertVectorStore(MLIRContext *context, bool disableAtomicRMW)
+  ConvertVectorStore(MLIRContext *context, bool disableAtomicRMW,
+                     bool assumeAligned)
       : OpConversionPattern<vector::StoreOp>(context),
-        disableAtomicRMW(disableAtomicRMW) {}
+        disableAtomicRMW(disableAtomicRMW), assumeAligned(assumeAligned) {}
 
   LogicalResult
   matchAndRewrite(vector::StoreOp op, OpAdaptor adaptor,
@@ -596,6 +604,37 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
     auto origElements = valueToStore.getType().getNumElements();
     // Note, per-element-alignment was already verified above.
     bool isDivisibleInSize = origElements % emulatedPerContainerElem == 0;
+
+    // In assume-aligned mode, isDivisibleInSize alone is sufficient — the
+    // caller guarantees that store offsets are aligned to container element
+    // boundaries.
+    if (assumeAligned) {
+      if (!isDivisibleInSize)
+        return rewriter.notifyMatchFailure(
+            op, "the source vector does not fill whole container elements "
+                "(not divisible in size)");
+
+      auto stridedMetadata =
+          memref::ExtractStridedMetadataOp::create(rewriter, loc, op.getBase());
+      OpFoldResult linearizedIndices;
+      std::tie(std::ignore, linearizedIndices) =
+          memref::getLinearizedMemRefOffsetAndSize(
+              rewriter, loc, emulatedBits, containerBits,
+              stridedMetadata.getConstifiedMixedOffset(),
+              stridedMetadata.getConstifiedMixedSizes(),
+              stridedMetadata.getConstifiedMixedStrides(),
+              getAsOpFoldResult(adaptor.getIndices()));
+      auto memrefBase = cast<MemRefValue>(adaptor.getBase());
+      int numElements = origElements / emulatedPerContainerElem;
+      auto bitCast = vector::BitCastOp::create(
+          rewriter, loc, VectorType::get(numElements, containerElemTy),
+          op.getValueToStore());
+      rewriter.replaceOpWithNewOp<vector::StoreOp>(
+          op, bitCast.getResult(), memrefBase,
+          getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices));
+      return success();
+    }
+
     // Do the trailing dim for source and destination match? If yes, then the
     // corresponding index must be 0.
     // FIXME: There's no way to tell for dynamic shapes, so we should bail out.
@@ -813,6 +852,7 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
 
 private:
   const bool disableAtomicRMW;
+  const bool assumeAligned;
 };
 
 //===----------------------------------------------------------------------===//
@@ -2245,7 +2285,7 @@ struct RewriteVectorTranspose : OpRewritePattern<vector::TransposeOp> {
 // The emulated type is inferred from the converted memref type.
 void vector::populateVectorNarrowTypeEmulationPatterns(
     const arith::NarrowTypeEmulationConverter &typeConverter,
-    RewritePatternSet &patterns, bool disableAtomicRMW) {
+    RewritePatternSet &patterns, bool disableAtomicRMW, bool assumeAligned) {
   // Populate `vector.*` conversion patterns.
   // TODO: #119553 support atomicity
   patterns.add<ConvertVectorLoad, ConvertVectorMaskedLoad,
@@ -2255,7 +2295,8 @@ void vector::populateVectorNarrowTypeEmulationPatterns(
   // Populate `vector.*` store conversion patterns. The caller can choose
   // to avoid emitting atomic operations and reduce it to read-modify-write
   // sequence for stores if it is known there are no thread contentions.
-  patterns.insert<ConvertVectorStore>(patterns.getContext(), disableAtomicRMW);
+  patterns.insert<ConvertVectorStore>(patterns.getContext(), disableAtomicRMW,
+                                      assumeAligned);
 }
 
 void vector::populateVectorNarrowTypeRewritePatterns(

diff  --git a/mlir/test/Dialect/Vector/vector-emulate-narrow-type-aligned-store-only.mlir b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-aligned-store-only.mlir
new file mode 100644
index 0000000000000..2ef568fa6a741
--- /dev/null
+++ b/mlir/test/Dialect/Vector/vector-emulate-narrow-type-aligned-store-only.mlir
@@ -0,0 +1,55 @@
+// RUN: mlir-opt --test-emulate-narrow-int="memref-load-bitwidth=8 assume-aligned=true" --cse --verify-diagnostics --split-input-file %s | FileCheck %s
+
+/// Aligned store into a statically shaped memref - the source vector fills
+/// whole container elements. Produces a simple bitcast + store.
+func.func @vector_store_i4_aligned_const(%arg0: vector<8xi4>, %arg1: index, %arg2: index) {
+    %0 = memref.alloc() : memref<4x8xi4>
+    vector.store %arg0, %0[%arg1, %arg2] : memref<4x8xi4>, vector<8xi4>
+    return
+}
+//  CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
+//      CHECK: func @vector_store_i4_aligned_const
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: vector<8xi4>
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9]+]]: index
+//      CHECK:   %[[ALLOC:.+]] = memref.alloc() : memref<16xi8>
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP]]()[%[[ARG1]], %[[ARG2]]]
+//      CHECK:   %[[VEC_I8:.+]] = vector.bitcast %[[ARG0]] : vector<8xi4> to vector<4xi8>
+//      CHECK:   vector.store %[[VEC_I8]], %[[ALLOC]][%[[INDEX]]] : memref<16xi8>, vector<4xi8>
+
+// -----
+
+/// Aligned store, dynamic shape. The source vector (8 x i4 = 32 bits) is a
+/// whole multiple of the container element size (i8 = 8 bits) and the dynamic
+/// offset is assumed container-aligned, so no partial stores are needed.
+func.func @vector_store_i4_aligned_dynamic(%arg0: vector<8xi4>, %arg1: index, %arg2: index, %arg3: index, %arg4: index) {
+    %0 = memref.alloc(%arg1, %arg2) : memref<?x?xi4>
+    vector.store %arg0, %0[%arg3, %arg4] : memref<?x?xi4>, vector<8xi4>
+    return
+}
+//  CHECK-DAG: #[[$MAP:.+]] = affine_map<()[s0, s1] -> ((s0 * s1) floordiv 2, s0 floordiv 2)>
+//  CHECK-DAG: #[[$MAP1:.+]] = affine_map<()[s0, s1, s2] -> ((s2 + s0 * s1) floordiv 2)>
+//      CHECK: func @vector_store_i4_aligned_dynamic
+// CHECK-SAME:   %[[ARG0:[a-zA-Z0-9]+]]: vector<8xi4>
+// CHECK-SAME:   %[[ARG1:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG2:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG3:[a-zA-Z0-9]+]]: index
+// CHECK-SAME:   %[[ARG4:[a-zA-Z0-9]+]]: index
+//      CHECK:   %[[SIZE:.+]] = affine.max #[[$MAP]]()[%[[ARG2]], %[[ARG1]]]
+//      CHECK:   %[[ALLOC:.+]] = memref.alloc(%[[SIZE]]) : memref<?xi8>
+//      CHECK:   %[[INDEX:.+]] = affine.apply #[[$MAP1]]()[%[[ARG3]], %[[ARG2]], %[[ARG4]]]
+//      CHECK:   %[[VEC_I8:.+]] = vector.bitcast %[[ARG0]] : vector<8xi4> to vector<4xi8>
+//      CHECK:   vector.store %[[VEC_I8]], %[[ALLOC]][%[[INDEX]]] : memref<?xi8>, vector<4xi8>
+
+// -----
+
+/// The source vector does not fill whole container elements (3 x i4 != N x i8),
+/// so the pattern rejects it. With assume-aligned=true there is no fallback to
+/// the partial-store path, so legalization fails.
+func.func @vector_store_i4_not_divisible(%arg0: vector<3xi4>) {
+    %0 = memref.alloc() : memref<12xi4>
+    %c0 = arith.constant 0 : index
+    // expected-error @below {{failed to legalize operation 'vector.store' that was explicitly marked illegal}}
+    vector.store %arg0, %0[%c0] : memref<12xi4>, vector<3xi4>
+    return
+}

diff  --git a/mlir/test/lib/Dialect/MemRef/TestEmulateNarrowType.cpp b/mlir/test/lib/Dialect/MemRef/TestEmulateNarrowType.cpp
index b5f015aff19b4..921afb8d2180a 100644
--- a/mlir/test/lib/Dialect/MemRef/TestEmulateNarrowType.cpp
+++ b/mlir/test/lib/Dialect/MemRef/TestEmulateNarrowType.cpp
@@ -100,8 +100,8 @@ struct TestEmulateNarrowTypePass
 
     arith::populateArithNarrowTypeEmulationPatterns(typeConverter, patterns);
     memref::populateMemRefNarrowTypeEmulationPatterns(typeConverter, patterns);
-    vector::populateVectorNarrowTypeEmulationPatterns(typeConverter, patterns,
-                                                      disableAtomicRMW);
+    vector::populateVectorNarrowTypeEmulationPatterns(
+        typeConverter, patterns, disableAtomicRMW, assumeAligned);
 
     if (failed(applyPartialConversion(op, target, std::move(patterns))))
       signalPassFailure();
@@ -126,6 +126,12 @@ struct TestEmulateNarrowTypePass
       llvm::cl::desc("disable atomic read-modify-write and prefer generating "
                      "normal sequence"),
       llvm::cl::init(false)};
+
+  Option<bool> assumeAligned{
+      *this, "assume-aligned",
+      llvm::cl::desc("assume store offsets are aligned to container element "
+                     "boundaries"),
+      llvm::cl::init(false)};
 };
 
 struct TestMemRefFlattenAndVectorNarrowTypeEmulationPass