[Mlir-commits] [mlir] [mlir][Vector] Update VectorEmulateNarrowType.cpp (2/N) (PR #123527)

Andrzej WarzyƄski llvmlistbot at llvm.org
Sun Feb 2 07:46:16 PST 2025


https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/123527

From 3954c8f5dafae1f3435d490b3f114420dabbe925 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Sun, 2 Feb 2025 15:21:41 +0000
Subject: [PATCH 1/2] [mlir][vector][nfc] Fix typos in
 "VectorEmulateNarrowType.cpp"

Updates `emulatedVectorLoad`, which was introduced in #115922.
Specifically, at the moment `emulatedVectorLoad` conflates "emulated
type" and "container type". This only became clear after #123526, in
which the concepts of "emulated" and "container" types were introduced.

This is an NFC change and simply updates the variable naming.
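
For illustration, the emulation boils down to loading the data via the
container type and bit-casting the result back to the emulated type. A
minimal sketch (the SSA names and memref type are illustrative, not
taken from this patch), assuming `i4` is emulated via an `i8` container:

```mlir
// Load 2 container (i8) elements, which hold 4 emulated (i4) elements.
%load = vector.load %base[%idx] : memref<?xi8>, vector<2xi8>
// Bit-cast back to the emulated sub-byte type.
%res = vector.bitcast %load : vector<2xi8> to vector<4xi4>
```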
---
 .../Transforms/VectorEmulateNarrowType.cpp    | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index 63365cb5446124..0d310dc8be2fe9 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -278,23 +278,25 @@ static Value dynamicallyInsertSubVector(RewriterBase &rewriter, Location loc,
   return dest;
 }
 
-/// Returns the op sequence for an emulated sub-byte data type vector load.
-/// specifically, use `emulatedElemType` for loading a vector of `origElemType`.
-/// The load location is given by `base` and `linearizedIndices`, and the
-/// load size is given by `numEmulatedElementsToLoad`.
+/// Emulate a vector load for `emulatedElemTy` using `containerElemTy`
+///
+/// Specifically, use `containerElemTy` for loading a vector of
+/// `emulatedElemTy`. The load location is given by `base` and
+/// `linearizedIndices`, and the load size is given by
+/// `numEmulatedElementsToLoad`.
 static VectorValue emulatedVectorLoad(OpBuilder &rewriter, Location loc,
                                       Value base,
                                       OpFoldResult linearizedIndices,
-                                      int64_t numEmultedElementsToLoad,
-                                      Type origElemType,
-                                      Type emulatedElemType) {
-  auto scale = emulatedElemType.getIntOrFloatBitWidth() /
-               origElemType.getIntOrFloatBitWidth();
+                                      int64_t numContainerElemsToLoad,
+                                      Type emulatedElemTy,
+                                      Type containerElemTy) {
+  auto scale = containerElemTy.getIntOrFloatBitWidth() /
+               emulatedElemTy.getIntOrFloatBitWidth();
   auto newLoad = rewriter.create<vector::LoadOp>(
-      loc, VectorType::get(numEmultedElementsToLoad, emulatedElemType), base,
+      loc, VectorType::get(numContainerElemsToLoad, containerElemTy), base,
       getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices));
   return rewriter.create<vector::BitCastOp>(
-      loc, VectorType::get(numEmultedElementsToLoad * scale, origElemType),
+      loc, VectorType::get(numContainerElemsToLoad * scale, emulatedElemTy),
       newLoad);
 }
 

From 9fab1bb9988cacea6075690f74dcec0c6740b2f8 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Sun, 2 Feb 2025 15:36:33 +0000
Subject: [PATCH 2/2] [mlir][Vector] Update VectorEmulateNarrowType.cpp (2/N)

This is PR 2 in a series of N patches aimed at improving
"VectorEmulateNarrowType.cpp". It is mainly minor refactoring; no major
functional changes are made.

This PR renames the variable "scale". Note that "scale" could mean
either:

  * "emulated-elements-per-container-element", or
  * "container-elements-per-emulated-element".

While it is clear from the context that it is always the former (the
emulated type is always a sub-byte type and the container type is
usually `i8`), this PR reduces the cognitive load by renaming "scale" to
`emulatedPerContainerElem`, making the meaning explicit.
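
As a concrete example (the types below are illustrative, not taken from
this patch), for `i2` emulated via an `i8` container,
`emulatedPerContainerElem = 8 / 2 = 4`, so 2 container elements hold 8
emulated elements:

```mlir
// 2 x i8 (container) holds 8 x i2 (emulated); emulatedPerContainerElem = 4.
%0 = vector.load %base[%i] : memref<?xi8>, vector<2xi8>
%1 = vector.bitcast %0 : vector<2xi8> to vector<8xi2>
```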

**DEPENDS ON:**
* #123526

Please only review the [top
commit](https://github.com/llvm/llvm-project/pull/123527/commits/d40b31bb098e874be488182050c68b887e8d091a).

**GitHub issue to track this work**:
https://github.com/llvm/llvm-project/issues/123630
---
 .../Transforms/VectorEmulateNarrowType.cpp    | 52 +++++++++----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index 0d310dc8be2fe9..a0e170c7a04125 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -290,13 +290,13 @@ static VectorValue emulatedVectorLoad(OpBuilder &rewriter, Location loc,
                                       int64_t numContainerElemsToLoad,
                                       Type emulatedElemTy,
                                       Type containerElemTy) {
-  auto scale = containerElemTy.getIntOrFloatBitWidth() /
+  auto emulatedPerContainerElem = containerElemTy.getIntOrFloatBitWidth() /
                emulatedElemTy.getIntOrFloatBitWidth();
   auto newLoad = rewriter.create<vector::LoadOp>(
       loc, VectorType::get(numContainerElemsToLoad, containerElemTy), base,
       getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices));
   return rewriter.create<vector::BitCastOp>(
-      loc, VectorType::get(numContainerElemsToLoad * scale, emulatedElemTy),
+      loc, VectorType::get(numContainerElemsToLoad * emulatedPerContainerElem, emulatedElemTy),
       newLoad);
 }
 
@@ -388,10 +388,10 @@ static Value extractSliceIntoByte(ConversionPatternRewriter &rewriter,
       "sliceNumElements * vector element size must be less than or equal to 8");
   assert(8 % vectorElementType.getIntOrFloatBitWidth() == 0 &&
          "vector element must be a valid sub-byte type");
-  auto scale = 8 / vectorElementType.getIntOrFloatBitWidth();
+  auto emulatedPerContainerElem = 8 / vectorElementType.getIntOrFloatBitWidth();
   auto emptyByteVector = rewriter.create<arith::ConstantOp>(
-      loc, VectorType::get({scale}, vectorElementType),
-      rewriter.getZeroAttr(VectorType::get({scale}, vectorElementType)));
+      loc, VectorType::get({emulatedPerContainerElem}, vectorElementType),
+      rewriter.getZeroAttr(VectorType::get({emulatedPerContainerElem}, vectorElementType)));
   auto extracted = staticallyExtractSubvector(rewriter, loc, vector,
                                               extractOffset, sliceNumElements);
   return staticallyInsertSubvector(rewriter, loc, extracted, emptyByteVector,
@@ -656,9 +656,9 @@ struct ConvertVectorMaskedStore final
               "(bit-wise misalignment)");
     }
 
-    int scale = containerBits / emulatedBits;
+    int emulatedPerContainerElem = containerBits / emulatedBits;
     int origElements = op.getValueToStore().getType().getNumElements();
-    if (origElements % scale != 0)
+    if (origElements % emulatedPerContainerElem != 0)
       return failure();
 
     auto stridedMetadata =
@@ -708,11 +708,11 @@ struct ConvertVectorMaskedStore final
     // FIXME: Make an example based on the comment above work (see #115460 for
     // reproducer).
     FailureOr<Operation *> newMask =
-        getCompressedMaskOp(rewriter, loc, op.getMask(), origElements, scale);
+        getCompressedMaskOp(rewriter, loc, op.getMask(), origElements, emulatedPerContainerElem);
     if (failed(newMask))
       return failure();
 
-    auto numElements = (origElements + scale - 1) / scale;
+    auto numElements = (origElements + emulatedPerContainerElem - 1) / emulatedPerContainerElem;
     auto newType = VectorType::get(numElements, containerElemTy);
     auto passThru = rewriter.create<arith::ConstantOp>(
         loc, newType, rewriter.getZeroAttr(newType));
@@ -721,7 +721,7 @@ struct ConvertVectorMaskedStore final
         loc, newType, adaptor.getBase(), linearizedIndices,
         newMask.value()->getResult(0), passThru);
 
-    auto newBitCastType = VectorType::get(numElements * scale, emulatedElemTy);
+    auto newBitCastType = VectorType::get(numElements * emulatedPerContainerElem, emulatedElemTy);
     Value valueToStore =
         rewriter.create<vector::BitCastOp>(loc, newBitCastType, newLoad);
     valueToStore = rewriter.create<arith::SelectOp>(
@@ -765,7 +765,7 @@ struct ConvertVectorLoad final : OpConversionPattern<vector::LoadOp> {
           op, "impossible to pack emulated elements into container elements "
               "(bit-wise misalignment)");
     }
-    int scale = containerBits / emulatedBits;
+    int emulatedPerContainerElem = containerBits / emulatedBits;
 
     // Adjust the number of elements to load when emulating narrow types,
     // and then cast back to the original type with vector.bitcast op.
@@ -797,7 +797,7 @@ struct ConvertVectorLoad final : OpConversionPattern<vector::LoadOp> {
     // compile time as they must be constants.
 
     auto origElements = op.getVectorType().getNumElements();
-    bool isAlignedEmulation = origElements % scale == 0;
+    bool isAlignedEmulation = origElements % emulatedPerContainerElem == 0;
 
     auto stridedMetadata =
         rewriter.create<memref::ExtractStridedMetadataOp>(loc, op.getBase());
@@ -818,9 +818,9 @@ struct ConvertVectorLoad final : OpConversionPattern<vector::LoadOp> {
             : getConstantIntValue(linearizedInfo.intraDataOffset);
 
     // Always load enough elements which can cover the original elements.
-    int64_t maxintraDataOffset = foldedIntraVectorOffset.value_or(scale - 1);
+    int64_t maxintraDataOffset = foldedIntraVectorOffset.value_or(emulatedPerContainerElem - 1);
     auto numElements =
-        llvm::divideCeil(maxintraDataOffset + origElements, scale);
+        llvm::divideCeil(maxintraDataOffset + origElements, emulatedPerContainerElem);
     Value result =
         emulatedVectorLoad(rewriter, loc, adaptor.getBase(), linearizedIndices,
                            numElements, emulatedElemTy, containerElemTy);
@@ -870,7 +870,7 @@ struct ConvertVectorMaskedLoad final
           op, "impossible to pack emulated elements into container elements "
               "(bit-wise misalignment)");
     }
-    int scale = containerBits / emulatedBits;
+    int emulatedPerContainerElem = containerBits / emulatedBits;
 
     // Adjust the number of elements to load when emulating narrow types,
     // and then cast back to the original type with vector.bitcast op.
@@ -916,7 +916,7 @@ struct ConvertVectorMaskedLoad final
     // subvector at the proper offset after bit-casting.
     auto origType = op.getVectorType();
     auto origElements = origType.getNumElements();
-    bool isAlignedEmulation = origElements % scale == 0;
+    bool isAlignedEmulation = origElements % emulatedPerContainerElem == 0;
 
     auto stridedMetadata =
         rewriter.create<memref::ExtractStridedMetadataOp>(loc, op.getBase());
@@ -935,18 +935,18 @@ struct ConvertVectorMaskedLoad final
             ? 0
             : getConstantIntValue(linearizedInfo.intraDataOffset);
 
-    int64_t maxIntraDataOffset = foldedIntraVectorOffset.value_or(scale - 1);
+    int64_t maxIntraDataOffset = foldedIntraVectorOffset.value_or(emulatedPerContainerElem - 1);
     FailureOr<Operation *> newMask = getCompressedMaskOp(
-        rewriter, loc, op.getMask(), origElements, scale, maxIntraDataOffset);
+        rewriter, loc, op.getMask(), origElements, emulatedPerContainerElem, maxIntraDataOffset);
     if (failed(newMask))
       return failure();
 
     Value passthru = op.getPassThru();
 
     auto numElements =
-        llvm::divideCeil(maxIntraDataOffset + origElements, scale);
+        llvm::divideCeil(maxIntraDataOffset + origElements, emulatedPerContainerElem);
     auto loadType = VectorType::get(numElements, containerElemTy);
-    auto newBitcastType = VectorType::get(numElements * scale, emulatedElemTy);
+    auto newBitcastType = VectorType::get(numElements * emulatedPerContainerElem, emulatedElemTy);
 
     auto emptyVector = rewriter.create<arith::ConstantOp>(
         loc, newBitcastType, rewriter.getZeroAttr(newBitcastType));
@@ -974,7 +974,7 @@ struct ConvertVectorMaskedLoad final
 
     Value mask = op.getMask();
     auto newSelectMaskType =
-        VectorType::get(numElements * scale, rewriter.getI1Type());
+        VectorType::get(numElements * emulatedPerContainerElem, rewriter.getI1Type());
     // TODO: try to fold if op's mask is constant
     auto emptyMask = rewriter.create<arith::ConstantOp>(
         loc, newSelectMaskType, rewriter.getZeroAttr(newSelectMaskType));
@@ -1033,11 +1033,11 @@ struct ConvertVectorTransferRead final
           op, "impossible to pack emulated elements into container elements "
               "(bit-wise misalignment)");
     }
-    int scale = containerBits / emulatedBits;
+    int emulatedPerContainerElem = containerBits / emulatedBits;
 
     auto origElements = op.getVectorType().getNumElements();
 
-    bool isAlignedEmulation = origElements % scale == 0;
+    bool isAlignedEmulation = origElements % emulatedPerContainerElem == 0;
 
     auto newPadding = rewriter.create<arith::ExtUIOp>(loc, containerElemTy,
                                                       adaptor.getPadding());
@@ -1060,9 +1060,9 @@ struct ConvertVectorTransferRead final
             ? 0
             : getConstantIntValue(linearizedInfo.intraDataOffset);
 
-    int64_t maxIntraDataOffset = foldedIntraVectorOffset.value_or(scale - 1);
+    int64_t maxIntraDataOffset = foldedIntraVectorOffset.value_or(emulatedPerContainerElem - 1);
     auto numElements =
-        llvm::divideCeil(maxIntraDataOffset + origElements, scale);
+        llvm::divideCeil(maxIntraDataOffset + origElements, emulatedPerContainerElem);
 
     auto newRead = rewriter.create<vector::TransferReadOp>(
         loc, VectorType::get(numElements, containerElemTy), adaptor.getSource(),
@@ -1070,7 +1070,7 @@ struct ConvertVectorTransferRead final
         newPadding);
 
     auto bitCast = rewriter.create<vector::BitCastOp>(
-        loc, VectorType::get(numElements * scale, emulatedElemTy), newRead);
+        loc, VectorType::get(numElements * emulatedPerContainerElem, emulatedElemTy), newRead);
 
     Value result = bitCast->getResult(0);
     if (!foldedIntraVectorOffset) {


