[Mlir-commits] [mlir] [mlir][Vector] Add narrow type emulation pattern for vector.maskedload (PR #68443)
Diego Caballero
llvmlistbot at llvm.org
Wed Oct 18 11:31:04 PDT 2023
================
@@ -103,6 +104,212 @@ struct ConvertVectorLoad final : OpConversionPattern<vector::LoadOp> {
}
};
+//===----------------------------------------------------------------------===//
+// ConvertVectorMaskedLoad
+//===----------------------------------------------------------------------===//
+
+struct ConvertVectorMaskedLoad final
+ : OpConversionPattern<vector::MaskedLoadOp> {
+ using OpConversionPattern::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::MaskedLoadOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+
+ auto loc = op.getLoc();
+ auto convertedType = cast<MemRefType>(adaptor.getBase().getType());
+ Type oldElementType = op.getType().getElementType();
+ Type newElementType = convertedType.getElementType();
+ int srcBits = oldElementType.getIntOrFloatBitWidth();
+ int dstBits = newElementType.getIntOrFloatBitWidth();
+
+ if (dstBits % srcBits != 0) {
+ return rewriter.notifyMatchFailure(
+ op, "only dstBits % srcBits == 0 supported");
+ }
+ int scale = dstBits / srcBits;
+
+ // Adjust the number of elements to load when emulating narrow types,
+ // and then cast back to the original type with vector.bitcast op.
+ // For example, to emulate i4 to i8, the following op:
+ //
+ // %mask = vector.constant_mask [3] : vector<6xi1>
+ // %1 = vector.maskedload %0[%c0, %c0], %mask, %pass_thru :
+ // memref<3x6xi4>, vector<6xi1>, vector<6xi4> into vector<6xi4>
+ //
+ // can be replaced with
+ //
+ // %new_mask = vector.constant_mask [2] : vector<3xi1>
+ // %new_pass_thru = vector.bitcast %pass_thru : vector<6xi4> to vector<3xi8>
+ // %1 = vector.maskedload %0[%linear_index], %new_mask, %new_pass_thru :
+ // memref<9xi8>, vector<3xi1>, vector<3xi8> into vector<3xi8>
+ //
+ // Since we are effectively loading 16 bits (2xi8) from the memref with the
+ // new mask, while originally we only wanted to effectively load 12 bits
+ // (3xi4) from the memref, we need to set the second half of the last i8
+ // that was effectively loaded (i.e. the second i8) to 0.
----------------
dcaballe wrote:
I'm probably the one missing something :) I meant:
```
%i8ld = vector.maskedload %0[%linear_index], %new_mask, %new_pass_thru ...
%i4ld = vector.bitcast %i8ld : vector<3xi8> to vector<6xi4>
%final_ld = arith.select %mask, %i4ld, %pass_thru : vector<6xi1>, vector<6xi4>
```
Would this work? The generated asm may not be ideal but it should definitely reduce the complexity of this lowering.
https://github.com/llvm/llvm-project/pull/68443
More information about the Mlir-commits
mailing list