[Mlir-commits] [mlir] [MLIR] Implement emulation of static indexing subbyte type vector stores (PR #115922)
Andrzej Warzyński
llvmlistbot at llvm.org
Thu Jan 16 09:52:19 PST 2025
================
@@ -336,30 +442,178 @@ struct ConvertVectorStore final : OpConversionPattern<vector::StoreOp> {
// vector.store %bitcast, %alloc[%linear_index] : memref<16xi8>,
// vector<4xi8>
- auto origElements = op.getValueToStore().getType().getNumElements();
- if (origElements % scale != 0)
- return failure();
+ auto origElements = valueToStore.getType().getNumElements();
+ bool isAlignedEmulation = origElements % numSrcElemsPerDest == 0;
auto stridedMetadata =
rewriter.create<memref::ExtractStridedMetadataOp>(loc, op.getBase());
OpFoldResult linearizedIndices;
- std::tie(std::ignore, linearizedIndices) =
+ memref::LinearizedMemRefInfo linearizedInfo;
+ std::tie(linearizedInfo, linearizedIndices) =
memref::getLinearizedMemRefOffsetAndSize(
rewriter, loc, srcBits, dstBits,
stridedMetadata.getConstifiedMixedOffset(),
stridedMetadata.getConstifiedMixedSizes(),
stridedMetadata.getConstifiedMixedStrides(),
getAsOpFoldResult(adaptor.getIndices()));
- auto numElements = origElements / scale;
- auto bitCast = rewriter.create<vector::BitCastOp>(
- loc, VectorType::get(numElements, newElementType),
- op.getValueToStore());
+ std::optional<int64_t> foldedNumFrontPadElems =
+ isAlignedEmulation
+ ? 0
+ : getConstantIntValue(linearizedInfo.intraDataOffset);
+
+ if (!foldedNumFrontPadElems) {
+ return rewriter.notifyMatchFailure(
+ op, "subbyte store emulation: dynamic front padding size is "
+ "not yet implemented");
+ }
+
+ auto memrefBase = cast<MemRefValue>(adaptor.getBase());
- rewriter.replaceOpWithNewOp<vector::StoreOp>(
- op, bitCast.getResult(), adaptor.getBase(),
- getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices));
+ // Shortcut: conditions when subbyte emulated store at the front is not
+ // needed:
+ // 1. The source vector size (in bits) is a multiple of byte size.
+ // 2. The address of the store is aligned to the emulated width boundary.
+ //
+ // For example, storing a vector<4xi2> to memref<13xi2> at offset 4 does
+ // not need unaligned emulation because the store address is aligned and
+ // the source is a whole byte.
+ if (isAlignedEmulation && *foldedNumFrontPadElems == 0) {
+ auto numElements = origElements / numSrcElemsPerDest;
+ auto bitCast = rewriter.create<vector::BitCastOp>(
+ loc, VectorType::get(numElements, newElementType),
+ op.getValueToStore());
+ rewriter.replaceOpWithNewOp<vector::StoreOp>(
+ op, bitCast.getResult(), memrefBase,
+ getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices));
+ return success();
+ }
----------------
banach-space wrote:
From what I can tell, this block is not tested. Something like this will exercise it:
```mlir
// Aligned store, hence full stores.
func.func @vector_store_i2_one_full_store_multiple_bytes(%arg0: vector<32xi2>) {
  %alloc = memref.alloc() : memref<32xi2>
  %c0 = arith.constant 0 : index
  vector.store %arg0, %alloc[%c0] : memref<32xi2>, vector<32xi2>
  return
}
```
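For reference, the aligned path in the hunk above should turn that into a single full-byte store, roughly along these lines (just a sketch; the exact SSA names and index materialization may differ):
```mlir
// Rough expected output after emulation: 32 x i2 = 64 bits = 8 bytes, so the
// whole value is written with one aligned byte-sized store.
func.func @vector_store_i2_one_full_store_multiple_bytes(%arg0: vector<32xi2>) {
  %alloc = memref.alloc() : memref<8xi8>
  %0 = vector.bitcast %arg0 : vector<32xi2> to vector<8xi8>
  %c0 = arith.constant 0 : index
  vector.store %0, %alloc[%c0] : memref<8xi8>, vector<8xi8>
  return
}
```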
Feel free to re-use it. Also, I don't really like expressions like "shortcut". Instead, IMO, this is a special "case". To demonstrate what I have in mind:
```cpp
bool emulationRequiresPartialStores =
    !isAlignedEmulation || *foldedNumFrontPadElems != 0;
if (!emulationRequiresPartialStores) {
  // Basic case, storing full bytes.
  // Your code here.
}
// Complex case, emulation requires partial stores.
```
Something along those lines 😅
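For contrast, a hypothetical input that would have to take the complex path (the name and shapes are illustrative, not taken from the PR) is a store whose size and offset are not byte-aligned:
```mlir
// Hypothetical unaligned case: 3 x i2 = 6 bits stored at element offset 2.
// The value straddles two bytes and neither byte is fully overwritten, so
// both ends need partial (read-modify-write) stores of the containing bytes.
func.func @vector_store_i2_unaligned(%arg0: vector<3xi2>) {
  %alloc = memref.alloc() : memref<13xi2>
  %c2 = arith.constant 2 : index
  vector.store %arg0, %alloc[%c2] : memref<13xi2>, vector<3xi2>
  return
}
```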
https://github.com/llvm/llvm-project/pull/115922