[Mlir-commits] [mlir] [mlir][Vector] Add a rewrite pattern for better low-precision bitcast… (PR #66387)

Thu Sep 14 07:55:39 PDT 2023

llvmbot wrote:




@llvm/pr-subscribers-mlir-vector
            
<details>
<summary>Changes</summary>
…(trunci) expansion

This revision adds a rewrite for sequences of vector `bitcast(trunci)` to use a more efficient sequence of vector operations comprising `shuffle` and `bitwise` ops.

Such patterns appear naturally when writing quantization / dequantization functionality with the vector dialect.

The rewrite performs a simple enumeration of each of the bits in the result vector and determines its provenance in the pre-trunci vector. The enumeration is used to generate the proper sequence of `shuffle`, `andi`, `ori` followed by an optional final `trunci`/`extui`.

The rewrite currently only applies to 1-D non-scalable vectors and bails out if the bitwidth of the final vector element type is not a multiple of 8. This is a failsafe heuristic determined empirically: if the resulting element type does not occupy a whole number of bytes, further complexities arise that are not improved by this pattern, and the heavy lifting still needs to be done by LLVM.
--

Patch is 23.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66387.diff

6 Files Affected:

- (modified) mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td (+13) 
- (modified) mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h (+14-1) 
- (modified) mlir/include/mlir/IR/BuiltinTypes.h (+10) 
- (modified) mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp (+5) 
- (modified) mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp (+186-4) 
- (added) mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir (+134) 


<pre>

diff --git a/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td b/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td
index 9e718a0c80bbf3b..133ee4e030f01e5 100644
--- a/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td
+++ b/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td
@@ -292,6 +292,19 @@ def ApplyLowerTransposePatternsOp : Op&lt;Transform_Dialect,
   }];
 }
 
+def ApplyRewriteNarrowTypePatternsOp : Op&lt;Transform_Dialect,
+    &quot;apply_patterns.vector.rewrite_narrow_types&quot;,
+    [DeclareOpInterfaceMethods&lt;PatternDescriptorOpInterface&gt;]&gt; {
+  let description = [{
+    Indicates that vector narrow rewrite operations should be applied.
+
+    This is usually a late step that is run after bufferization as part of the
+    process of lowering to e.g. LLVM or NVVM.
+  }];
+
+  let assemblyFormat = &quot;attr-dict&quot;;
+}
+
 def ApplySplitTransferFullPartialPatternsOp : Op&lt;Transform_Dialect,
     &quot;apply_patterns.vector.split_transfer_full_partial&quot;,
     [DeclareOpInterfaceMethods&lt;PatternDescriptorOpInterface&gt;]&gt; {
diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h
index c644090d8c78cd0..8652fc7f5e5c640 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h
@@ -24,6 +24,7 @@ class RewritePatternSet;
 
 namespace arith {
 class NarrowTypeEmulationConverter;
+class TruncIOp;
 } // namespace arith
 
 namespace vector {
@@ -143,7 +144,7 @@ void populateVectorTransferCollapseInnerMostContiguousDimsPatterns(
 
 /// Patterns that remove redundant vector broadcasts.
 void populateSinkVectorBroadcastPatterns(RewritePatternSet &amp;patterns,
-                                          PatternBenefit benefit = 1);
+                                         PatternBenefit benefit = 1);
 
 /// Populate `patterns` with the following patterns.
 ///
@@ -301,6 +302,18 @@ void populateVectorNarrowTypeEmulationPatterns(
     arith::NarrowTypeEmulationConverter &amp;typeConverter,
     RewritePatternSet &amp;patterns);
 
+/// Rewrite a vector `bitcast(trunci)` to use a more efficient sequence of
+/// vector operations comprising `shuffle` and `bitwise` ops.
+FailureOr&lt;Value&gt; rewriteBitCastOfTruncI(RewriterBase &amp;rewriter,
+                                        vector::BitCastOp bitCastOp,
+                                        arith::TruncIOp truncOp,
+                                        vector::BroadcastOp maybeBroadcastOp);
+
+/// Appends patterns for rewriting vector operations over narrow types with
+/// ops over wider types.
+void populateVectorNarrowTypeRewritePatterns(RewritePatternSet &amp;patterns,
+                                             PatternBenefit benefit = 1);
+
 } // namespace vector
 } // namespace mlir
 
diff --git a/mlir/include/mlir/IR/BuiltinTypes.h b/mlir/include/mlir/IR/BuiltinTypes.h
index f031eb0a5c30ce9..9df5548cd5d939c 100644
--- a/mlir/include/mlir/IR/BuiltinTypes.h
+++ b/mlir/include/mlir/IR/BuiltinTypes.h
@@ -357,6 +357,16 @@ class VectorType::Builder {
     return *this;
   }
 
+  /// Set a dim in shape @pos to val.
+  Builder &amp;setDim(unsigned pos, int64_t val) {
+    if (storage.empty())
+      storage.append(shape.begin(), shape.end());
+    assert(pos &lt; storage.size() &amp;&amp; &quot;overflow&quot;);
+    storage[pos] = val;
+    shape = {storage.data(), storage.size()};
+    return *this;
+  }
+
   operator VectorType() {
     return VectorType::get(shape, elementType, scalableDims);
   }
diff --git a/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp b/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp
index b388deaa46a7917..37127ea70f1e5af 100644
--- a/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp
+++ b/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp
@@ -159,6 +159,11 @@ void transform::ApplyLowerTransposePatternsOp::populatePatterns(
   }
 }
 
+void transform::ApplyRewriteNarrowTypePatternsOp::populatePatterns(
+    RewritePatternSet &amp;patterns) {
+  populateVectorNarrowTypeRewritePatterns(patterns);
+}
+
 void transform::ApplySplitTransferFullPartialPatternsOp::populatePatterns(
     RewritePatternSet &amp;patterns) {
   vector::VectorTransformsOptions vectorTransformOptions;
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index b2b7bfc5e4437c1..4d91aa2898f7ceb 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include &quot;mlir/Dialect/Affine/IR/AffineOps.h&quot;
 #include &quot;mlir/Dialect/Arith/IR/Arith.h&quot;
 #include &quot;mlir/Dialect/Arith/Transforms/NarrowTypeEmulationConverter.h&quot;
 #include &quot;mlir/Dialect/Arith/Utils/Utils.h&quot;
@@ -15,13 +14,23 @@
 #include &quot;mlir/Dialect/MemRef/Utils/MemRefUtils.h&quot;
 #include &quot;mlir/Dialect/Vector/IR/VectorOps.h&quot;
 #include &quot;mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h&quot;
+#include &quot;mlir/IR/BuiltinAttributes.h&quot;
+#include &quot;mlir/IR/BuiltinTypes.h&quot;
+#include &quot;mlir/IR/TypeUtilities.h&quot;
+#include &quot;mlir/IR/Value.h&quot;
 #include &quot;mlir/Transforms/DialectConversion.h&quot;
-#include &quot;llvm/Support/FormatVariadic.h&quot;
-#include &quot;llvm/Support/MathExtras.h&quot;
-#include &lt;cassert&gt;
+#include &quot;llvm/ADT/SmallVector.h&quot;
+#include &quot;llvm/Support/Debug.h&quot;
+#include &quot;llvm/Support/raw_ostream.h&quot;
+#include &lt;cstdint&gt;
 
 using namespace mlir;
 
+#define DEBUG_TYPE &quot;vector-narrow-type-emulation&quot;
+#define DBGS() (llvm::dbgs() &lt;&lt; &quot;[&quot; DEBUG_TYPE &quot;]: &quot;)
+#define DBGSNL() (llvm::dbgs() &lt;&lt; &quot;\n&quot;)
+#define LDBG(X) LLVM_DEBUG(DBGS() &lt;&lt; X &lt;&lt; &quot;\n&quot;)
+
 namespace {
 
 //===----------------------------------------------------------------------===//
@@ -155,6 +164,174 @@ struct ConvertVectorTransferRead final
 };
 } // end anonymous namespace
 
+//===----------------------------------------------------------------------===//
+// RewriteBitCastOfTruncI
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Helper struct to keep track of the provenance of a contiguous set of bits
+/// in a source vector.
+struct SourceElementRange {
+  int64_t sourceElement;
+  int64_t sourceBitBegin;
+  int64_t sourceBitEnd;
+};
+using SourceElementRangeList = SmallVector&lt;SourceElementRange&gt;;
+
+/// Helper struct to enumerate the source elements and bit ranges that are
+/// involved in a bitcast operation.
+/// This allows rewriting a vector.bitcast into shuffles and bitwise ops for
+/// any 1-D vector shape and any source/target bitwidths.
+struct BitCastBitsEnumerator {
+  BitCastBitsEnumerator(VectorType sourceVectorType,
+                        VectorType targetVectorType);
+
+  int64_t getNumVectorsNeeded() {
+    int64_t numVectors = 0;
+    for (const auto &amp;l : sourceElementRanges)
+      numVectors = std::max(numVectors, (int64_t)l.size());
+    return numVectors;
+  }
+
+  VectorType sourceVectorType;
+  VectorType targetVectorType;
+  SmallVector&lt;SourceElementRangeList&gt; sourceElementRanges;
+};
+
+} // namespace
+
+static raw_ostream &amp;operator&lt;&lt;(raw_ostream &amp;os,
+                               const SourceElementRangeList &amp;l) {
+  for (const auto &amp;s : l) {
+    os &lt;&lt; &quot;{ &quot; &lt;&lt; s.sourceElement &lt;&lt; &quot;: b@[&quot; &lt;&lt; s.sourceBitBegin &lt;&lt; &quot;..&quot;
+       &lt;&lt; s.sourceBitEnd &lt;&lt; &quot;) } &quot;;
+  }
+  return os;
+}
+
+static raw_ostream &amp;operator&lt;&lt;(raw_ostream &amp;os,
+                               const SmallVector&lt;SourceElementRangeList&gt; &amp;vec) {
+  for (const auto &amp;l : vec) {
+    for (const auto &amp;s : l) {
+      os &lt;&lt; &quot;{ &quot; &lt;&lt; s.sourceElement &lt;&lt; &quot;: b@[&quot; &lt;&lt; s.sourceBitBegin &lt;&lt; &quot;..&quot;
+         &lt;&lt; s.sourceBitEnd &lt;&lt; &quot;) } &quot;;
+    }
+    os &lt;&lt; &quot;\n&quot;;
+  }
+  return os;
+}
+
+BitCastBitsEnumerator::BitCastBitsEnumerator(VectorType sourceVectorType,
+                                             VectorType targetVectorType)
+    : sourceVectorType(sourceVectorType), targetVectorType(targetVectorType) {
+
+  assert(targetVectorType.getRank() == 1 &amp;&amp; !targetVectorType.isScalable() &amp;&amp;
+         &quot;requires -D non-scalable vector type&quot;);
+  int64_t sourceBitWidth = sourceVectorType.getElementTypeBitWidth();
+  int64_t mostMinorSourceDim = sourceVectorType.getShape().back();
+  LDBG(&quot;sourceVectorType: &quot; &lt;&lt; sourceVectorType);
+
+  int64_t targetBitWidth = targetVectorType.getElementTypeBitWidth();
+  int64_t mostMinorTargetDim = targetVectorType.getShape().back();
+  LDBG(&quot;targetVectorType: &quot; &lt;&lt; targetVectorType);
+
+  int64_t bitwidth = targetBitWidth * mostMinorTargetDim;
+  assert(bitwidth == sourceBitWidth * mostMinorSourceDim &amp;&amp;
+         &quot;source and target bitwidths must match&quot;);
+
+  // Prepopulate one source element range per target element.
+  sourceElementRanges = SmallVector&lt;SourceElementRangeList&gt;(mostMinorTargetDim);
+  for (int64_t resultBit = 0; resultBit &lt; bitwidth;) {
+    int64_t resultElement = resultBit / targetBitWidth;
+    int64_t resultBitInElement = resultBit % targetBitWidth;
+    int64_t sourceElement = resultBit / sourceBitWidth;
+    int64_t sourceBitInElement = resultBit % sourceBitWidth;
+    int64_t step = std::min(sourceBitWidth - sourceBitInElement,
+                            targetBitWidth - resultBitInElement);
+    sourceElementRanges[resultElement].push_back(
+        {sourceElement, sourceBitInElement, sourceBitInElement + step});
+    resultBit += step;
+  }
+}
+
+namespace {
+/// Rewrite bitcast(trunci) to a sequence of shuffles and bitwise ops that take
+/// advantage of high-level information to avoid leaving LLVM to scramble with
+/// peephole optimizations.
+struct RewriteBitCastOfTruncI : OpRewritePattern&lt;vector::BitCastOp&gt; {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::BitCastOp bitCastOp,
+                                PatternRewriter &amp;rewriter) const override {
+    // The source must be a trunc op.
+    auto truncOp =
+        bitCastOp.getSource().template getDefiningOp&lt;arith::TruncIOp&gt;();
+    if (!truncOp)
+      return rewriter.notifyMatchFailure(bitCastOp, &quot;not a trunci source&quot;);
+
+    VectorType targetVectorType = bitCastOp.getResultVectorType();
+    if (targetVectorType.getRank() != 1 || targetVectorType.isScalable())
+      return rewriter.notifyMatchFailure(bitCastOp, &quot;scalable or &gt;1-D vector&quot;);
+    // TODO: consider relaxing this restriction in the future if we find ways to
+    // really work with subbyte elements across the MLIR/LLVM boundary.
+    int64_t resultBitwidth = targetVectorType.getElementTypeBitWidth();
+    if (resultBitwidth % 8 != 0)
+      return rewriter.notifyMatchFailure(bitCastOp, &quot;bitwidth is not k * 8&quot;);
+
+    VectorType sourceVectorType = bitCastOp.getSourceVectorType();
+    BitCastBitsEnumerator be(sourceVectorType, targetVectorType);
+    LDBG(&quot;\n&quot; &lt;&lt; be.sourceElementRanges);
+
+    Value initialValue = truncOp.getIn();
+    auto initalVectorType = initialValue.getType().cast&lt;VectorType&gt;();
+    auto initalElementType = initalVectorType.getElementType();
+    auto initalElementBitWidth = initalElementType.getIntOrFloatBitWidth();
+    Value res;
+    // BitCastBitsEnumerator encodes for each element of the target vector the
+    // provenance of the bits in the source vector. We can &quot;transpose&quot; this
+    // information to build a sequence of shuffles and bitwise ops that will
+    // produce the desired result.
+    for (int64_t idx = 0, e = be.getNumVectorsNeeded(); idx &lt; e; ++idx) {
+      SmallVector&lt;int64_t&gt; shuffles;
+      SmallVector&lt;Attribute&gt; masks;
+      for (auto &amp;l : be.sourceElementRanges) {
+        int64_t bitLo = (idx &lt; (int64_t)l.size()) ? l[idx].sourceBitBegin : 0;
+        int64_t bitHi = (idx &lt; (int64_t)l.size()) ? l[idx].sourceBitEnd : 0;
+        int64_t sourceElement =
+            (idx &lt; (int64_t)l.size()) ? l[idx].sourceElement : 0;
+        shuffles.push_back(sourceElement);
+        IntegerAttr mask = IntegerAttr::get(
+            rewriter.getIntegerType(initalElementBitWidth),
+            llvm::APInt::getBitsSet(initalElementBitWidth, bitLo, bitHi));
+        masks.push_back(mask);
+      }
+      auto shuffleOp = rewriter.create&lt;vector::ShuffleOp&gt;(
+          bitCastOp.getLoc(), initialValue, initialValue, shuffles);
+      VectorType vt = VectorType::Builder(initalVectorType)
+                          .setDim(initalVectorType.getRank() - 1, masks.size());
+      auto constOp = rewriter.create&lt;arith::ConstantOp&gt;(
+          bitCastOp.getLoc(), DenseElementsAttr::get(vt, masks));
+      Value andValue = rewriter.create&lt;arith::AndIOp&gt;(bitCastOp.getLoc(),
+                                                      shuffleOp, constOp);
+      res =
+          res ? rewriter.create&lt;arith::OrIOp&gt;(bitCastOp.getLoc(), res, andValue)
+              : andValue;
+    }
+
+    bool narrowing = resultBitwidth &lt;= initalElementBitWidth;
+    if (narrowing) {
+      rewriter.replaceOpWithNewOp&lt;arith::TruncIOp&gt;(
+          bitCastOp, bitCastOp.getResultVectorType(), res);
+    } else {
+      rewriter.replaceOpWithNewOp&lt;arith::ExtUIOp&gt;(
+          bitCastOp, bitCastOp.getResultVectorType(), res);
+    }
+    return success();
+  }
+};
+} // namespace
+
 //===----------------------------------------------------------------------===//
 // Public Interface Definition
 //===----------------------------------------------------------------------===//
@@ -167,3 +344,8 @@ void vector::populateVectorNarrowTypeEmulationPatterns(
   patterns.add&lt;ConvertVectorLoad, ConvertVectorTransferRead&gt;(
       typeConverter, patterns.getContext());
 }
+
+void vector::populateVectorNarrowTypeRewritePatterns(
+    RewritePatternSet &amp;patterns, PatternBenefit benefit) {
+  patterns.add&lt;RewriteBitCastOfTruncI&gt;(patterns.getContext(), benefit);
+}
diff --git a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
new file mode 100644
index 000000000000000..1893bf732fe4c0a
--- /dev/null
+++ b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
@@ -0,0 +1,134 @@
+// RUN: mlir-opt %s --test-transform-dialect-interpreter --split-input-file | FileCheck %s
+
+/// Note: Inspect generated assembly and llvm-mca stats:
+/// ====================================================
+/// mlir-opt --test-transform-dialect-interpreter mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir -test-transform-dialect-erase-schedule -test-lower-to-llvm | mlir-translate -mlir-to-llvmir | llc -o - -mcpu=skylake-avx512 --function-sections -filetype=obj &gt; /tmp/a.out; objdump -d --disassemble=f1 --no-addresses --no-show-raw-insn -M att /tmp/a.out | ./build/bin/llvm-mca -mcpu=skylake-avx512
+
+// CHECK-LABEL: func.func @f1(
+//  CHECK-SAME: %[[A:[0-9a-z]*]]: vector&lt;32xi64&gt;) -&gt; vector&lt;20xi8&gt;
+func.func @f1(%a: vector&lt;32xi64&gt;) -&gt; vector&lt;20xi8&gt; {
+  /// Rewriting this standalone pattern is about 1.8x faster on skylake-avx512 according to llvm-mca.
+  /// Benefit further increases when mixed with other compute ops.
+  /// The provenance of the 20x8 bits of the result are the following bits in the
+  /// source vector:
+  ///   { 0: b@[0..5) } { 1: b@[0..3) }
+  ///   { 1: b@[3..5) } { 2: b@[0..5) } { 3: b@[0..1) }
+  ///   { 3: b@[1..5) } { 4: b@[0..4) }
+  ///   { 4: b@[4..5) } { 5: b@[0..5) } { 6: b@[0..2) }
+  ///   { 6: b@[2..5) } { 7: b@[0..5) }
+  ///   { 8: b@[0..5) } { 9: b@[0..3) }
+  ///   { 9: b@[3..5) } { 10: b@[0..5) } { 11: b@[0..1) }
+  ///   { 11: b@[1..5) } { 12: b@[0..4) }
+  ///   { 12: b@[4..5) } { 13: b@[0..5) } { 14: b@[0..2) }
+  ///   { 14: b@[2..5) } { 15: b@[0..5) }
+  ///   { 16: b@[0..5) } { 17: b@[0..3) }
+  ///   { 17: b@[3..5) } { 18: b@[0..5) } { 19: b@[0..1) }
+  ///   { 19: b@[1..5) } { 20: b@[0..4) }
+  ///   { 20: b@[4..5) } { 21: b@[0..5) } { 22: b@[0..2) }
+  ///   { 22: b@[2..5) } { 23: b@[0..5) }
+  ///   { 24: b@[0..5) } { 25: b@[0..3) }
+  ///   { 25: b@[3..5) } { 26: b@[0..5) } { 27: b@[0..1) }
+  ///   { 27: b@[1..5) } { 28: b@[0..4) }
+  ///   { 28: b@[4..5) } { 29: b@[0..5) } { 30: b@[0..2) }
+  ///   { 30: b@[2..5) } { 31: b@[0..5) }
+  /// This results in 3 shuffles + 3 and + 2 or.
+  /// The third vector is empty for positions 0, 2, 4, 5, 7, 9, 10, 12, 14, 15,
+  /// 17 and 19 (i.e. there are only 2 entries in that row).
+  /// 
+  ///                             0: b@[0..5), 1: b@[3..5), etc
+  // CHECK: %[[MASK0:.*]] = arith.constant dense&lt;[31, 24, 30, 16, 28, 31, 24, 30, 16, 28, 31, 24, 30, 16, 28, 31, 24, 30, 16, 28]&gt; : vector&lt;20xi64&gt;
+  ///                             1: b@[0..3), 2: b@[0..5), etc
+  // CHECK: %[[MASK1:.*]] = arith.constant dense&lt;[7, 31, 15, 31, 31, 7, 31, 15, 31, 31, 7, 31, 15, 31, 31, 7, 31, 15, 31, 31]&gt; :  vector&lt;20xi64&gt;
+  ///                             empty, 3: b@[0..1), empty etc
+  // CHECK: %[[MASK2:.*]] = arith.constant dense&lt;[0, 1, 0, 3, 0, 0, 1, 0, 3, 0, 0, 1, 0, 3, 0, 0, 1, 0, 3, 0]&gt; : vector&lt;20xi64&gt;
+  // CHECK: %[[V0:.*]] = vector.shuffle %[[A]], %[[A]] [0, 1, 3, 4, 6, 8, 9, 11, 12, 14, 16, 17, 19, 20, 22, 24, 25, 27, 28, 30] : vector&lt;32xi64&gt;, vector&lt;32xi64&gt;
+  // CHECK: %[[A0:.*]] = arith.andi %[[V0]], %[[MASK0]] : vector&lt;20xi64&gt;
+  // CHECK: %[[V1:.*]] = vector.shuffle %[[A]], %[[A]] [1, 2, 4, 5, 7, 9, 10, 12, 13, 15, 17, 18, 20, 21, 23, 25, 26, 28, 29, 31] : vector&lt;32xi64&gt;, vector&lt;32xi64&gt;
+  // CHECK: %[[A1:.*]] = arith.andi %[[V1]], %[[MASK1]] : vector&lt;20xi64&gt;
+  // CHECK: %[[O1:.*]] = arith.ori %[[A0]], %[[A1]] : vector&lt;20xi64&gt;
+  // CHECK: %[[V2:.*]] = vector.shuffle %[[A]], %[[A]] [0, 3, 0, 6, 0, 0, 11, 0, 14, 0, 0, 19, 0, 22, 0, 0, 27, 0, 30, 0] : vector&lt;32xi64&gt;, vector&lt;32xi64&gt;
+  // CHECK: %[[A2:.*]] = arith.andi %[[V2]], %[[MASK2]] : vector&lt;20xi64&gt;
+  // CHECK: %[[O2:.*]] = arith.ori %[[O1]], %[[A2]] : vector&lt;20xi64&gt;
+  // CHECK: %[[TR:.*]] = arith.trunci %[[O2]] : vector&l...
<truncated>
</pre>
</details>


https://github.com/llvm/llvm-project/pull/66387