[Mlir-commits] [mlir] [MLIR] Implement emulation of static indexing subbyte type vector stores (PR #115922)

Andrzej Warzyński llvmlistbot at llvm.org
Thu Jan 16 09:52:19 PST 2025


================
@@ -336,30 +442,178 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
     // vector.store %bitcast, %alloc[%linear_index] : memref<16xi8>,
     // vector<4xi8>
 
-    auto origElements = op.getValueToStore().getType().getNumElements();
-    if (origElements % scale != 0)
-      return failure();
+    auto origElements = valueToStore.getType().getNumElements();
+    bool isAlignedEmulation = origElements % numSrcElemsPerDest == 0;
 
     auto stridedMetadata =
         rewriter.create<memref::ExtractStridedMetadataOp>(loc, op.getBase());
 
     OpFoldResult linearizedIndices;
-    std::tie(std::ignore, linearizedIndices) =
+    memref::LinearizedMemRefInfo linearizedInfo;
+    std::tie(linearizedInfo, linearizedIndices) =
         memref::getLinearizedMemRefOffsetAndSize(
             rewriter, loc, srcBits, dstBits,
             stridedMetadata.getConstifiedMixedOffset(),
             stridedMetadata.getConstifiedMixedSizes(),
             stridedMetadata.getConstifiedMixedStrides(),
             getAsOpFoldResult(adaptor.getIndices()));
 
-    auto numElements = origElements / scale;
-    auto bitCast = rewriter.create<vector::BitCastOp>(
-        loc, VectorType::get(numElements, newElementType),
-        op.getValueToStore());
+    std::optional<int64_t> foldedNumFrontPadElems =
+        isAlignedEmulation
+            ? 0
+            : getConstantIntValue(linearizedInfo.intraDataOffset);
+
+    if (!foldedNumFrontPadElems) {
+      return failure("subbyte store emulation: dynamic front padding size is "
+                     "not yet implemented");
+    }
+
+    auto memrefBase = cast<MemRefValue>(adaptor.getBase());
 
-    rewriter.replaceOpWithNewOp<vector::StoreOp>(
-        op, bitCast.getResult(), adaptor.getBase(),
-        getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices));
+    // Shortcut: conditions under which the emulated sub-byte store at the
+    // front is not needed:
+    // 1. The size of the source vector (in bits) is a multiple of the byte
+    //    size.
+    // 2. The address of the store is aligned to the emulated width boundary.
+    //
+    // For example, storing a vector<4xi2> into a memref<13xi2> at offset 4
+    // does not need unaligned emulation: the store address is byte-aligned
+    // and the source occupies exactly one byte.
+    if (isAlignedEmulation && *foldedNumFrontPadElems == 0) {
+      auto numElements = origElements / numSrcElemsPerDest;
+      auto bitCast = rewriter.create<vector::BitCastOp>(
+          loc, VectorType::get(numElements, newElementType),
+          op.getValueToStore());
+      rewriter.replaceOpWithNewOp<vector::StoreOp>(
+          op, bitCast.getResult(), memrefBase,
+          getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices));
+      return success();
+    }
----------------
banach-space wrote:

>From what I can tell, this block is not tested. Something like this will exercise it:

```mlir
  // Aligned store of a whole number of bytes, hence full stores.

  func.func @vector_store_i2_one_full_store_multiple_bytes(%arg0: vector<32xi2>) {
    %alloc = memref.alloc() : memref<32xi2>
    %c0 = arith.constant 0 : index
    vector.store %arg0, %alloc[%c0] : memref<32xi2>, vector<32xi2>
    return
  }
```
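For context: vector<32xi2> is 64 bits, i.e. 8 whole bytes stored at a byte-aligned offset, so it should hit the full-store path. Assuming the lowering sketched in the comment at the top of this hunk, the emulated output would look roughly like this (a sketch, not CHECK lines from the patch):

```mlir
  %alloc = memref.alloc() : memref<8xi8>
  %0 = vector.bitcast %arg0 : vector<32xi2> to vector<8xi8>
  %c0 = arith.constant 0 : index
  vector.store %0, %alloc[%c0] : memref<8xi8>, vector<8xi8>
```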

Feel free to re-use it. Also, I don't really like expressions like "shortcut". Instead, IMO, this is a special "case".  To demonstrate what I have in mind:
```cpp
bool emulationRequiresPartialStores =
    !isAlignedEmulation || *foldedNumFrontPadElems != 0;
if (!emulationRequiresPartialStores) {
  // Basic case: storing full bytes.
  // Your code here (ending in `return success();`).
}

// Complex case: emulation requires partial stores.
```

Something along those lines 😅 
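For the complex path, a hypothetical counterpart (my own shape and offset, not from the patch) that should fall through to the partial-store handling, since 3 elements are not a multiple of the 4 i2 elements that fit in a byte:

```mlir
  // Unaligned store: 3 x i2 = 6 bits, i.e. not a whole byte, so the
  // full-store path above does not apply.
  func.func @vector_store_i2_one_partial_store(%arg0: vector<3xi2>) {
    %alloc = memref.alloc() : memref<3xi2>
    %c0 = arith.constant 0 : index
    vector.store %arg0, %alloc[%c0] : memref<3xi2>, vector<3xi2>
    return
  }
```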

https://github.com/llvm/llvm-project/pull/115922