[Mlir-commits] [mlir] [MLIR][Vector] Fix a narrow byte emulation alignment issue (PR #137970)

Alan Li llvmlistbot at llvm.org
Wed Apr 30 07:25:51 PDT 2025


https://github.com/lialan created https://github.com/llvm/llvm-project/pull/137970

This is a follow-up to https://github.com/llvm/llvm-project/pull/133231; it fixes an issue with partially storing the front byte. Specifically, this patch:

* Removes an unnecessary guard.
* Fixes an issue where the destination index was incremented even when the front byte was not partially stored; the increment is only needed after a partial front-byte store. (A sketch of the fixed logic follows below.)
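
For readers not staring at the pattern itself, here is a minimal, self-contained sketch of the fixed control flow in plain C++ rather than MLIR builder calls. The variable names mirror VectorEmulateNarrowType.cpp, but the concrete values (4 emulated elements per container byte, one element of front padding) are made up for illustration:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t emulatedPerContainerElem = 4; // e.g. i2 elements per i8 byte
      const int64_t numFrontPadElems = 1;         // hypothetical intra-byte offset
      // Number of elements that land in the partially filled front byte.
      const int64_t frontSubWidthStoreElem =
          (emulatedPerContainerElem - numFrontPadElems) % emulatedPerContainerElem;

      bool partiallyStoredFrontByte = false;
      int64_t currentDestIndex = 0; // destination byte index
      if (frontSubWidthStoreElem > 0) {
        // The real pattern emits a masked read-modify-write store here.
        partiallyStoredFrontByte = true;
      }

      // The fix: advance to the next byte boundary only when the front byte
      // was actually partially stored; an already-aligned store keeps its index.
      if (partiallyStoredFrontByte)
        ++currentDestIndex;

      std::printf("front elems: %lld, dest index: %lld\n",
                  (long long)frontSubWidthStoreElem, (long long)currentDestIndex);
      return 0;
    }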

From 759e87a3d4472b98e93d66027717ec969bcae937 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Wed, 30 Apr 2025 14:11:19 +0000
Subject: [PATCH] [MLIR][Vector] Fix a narrow byte emulation alignment issue

---
 .../Transforms/VectorEmulateNarrowType.cpp    | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index a560aa1b1e680..0f6c9ea267763 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -621,12 +621,6 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
             ? 0
             : getConstantIntValue(linearizedInfo.intraDataOffset);
 
-    if (!foldedNumFrontPadElems) {
-      return rewriter.notifyMatchFailure(
-          op, "subbyte store emulation: dynamic front padding size is "
-              "not yet implemented");
-    }
-
     auto memrefBase = cast<MemRefValue>(adaptor.getBase());
 
     // RMWs are not needed when:
@@ -722,6 +716,8 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
     auto frontSubWidthStoreElem =
         (emulatedPerContainerElem - *foldedNumFrontPadElems) %
         emulatedPerContainerElem;
+
+    bool partiallyStoredFrontByte = false;
     if (frontSubWidthStoreElem > 0) {
       SmallVector<bool> frontMaskValues(emulatedPerContainerElem, false);
       if (*foldedNumFrontPadElems + origElements < emulatedPerContainerElem) {
@@ -742,6 +738,7 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
 
       storeFunc(rewriter, loc, memrefBase, currentDestIndex,
                 cast<VectorValue>(value), frontMask.getResult());
+      partiallyStoredFrontByte = true;
     }
 
     if (currentSourceIndex >= origElements) {
@@ -749,11 +746,13 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
       return success();
     }
 
-    // Increment the destination index by 1 to align to the emulated width
-    // boundary.
-    auto constantOne = rewriter.create<arith::ConstantIndexOp>(loc, 1);
-    currentDestIndex = rewriter.create<arith::AddIOp>(
-        loc, rewriter.getIndexType(), currentDestIndex, constantOne);
+    if (partiallyStoredFrontByte) {
+      // Increment the destination index by 1 to align to the emulated width
+      // boundary, if the front byte was partially stored.
+      auto constantOne = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+      currentDestIndex = rewriter.create<arith::AddIOp>(
+          loc, rewriter.getIndexType(), currentDestIndex, constantOne);
+    }
 
     // 2. Full width store for the inner output bytes.
     // After the previous step, the store address is aligned to the emulated

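A quick sanity check of the new guard (illustrative numbers, not taken from the patch): with 4 emulated elements per container byte and zero front padding, frontSubWidthStoreElem = (4 - 0) % 4 = 0, so no masked front store is emitted, partiallyStoredFrontByte stays false, and the full-width stores begin at the original byte; before this change, the destination index was advanced unconditionally, shifting every subsequent store one byte too far. With one element of front padding, frontSubWidthStoreElem = 3, the masked front store fires, and the one-byte increment correctly realigns the index to the container boundary.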

