[llvm] 100cb9a - [VectorCombine] Fold shuffle select pattern
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri May 6 00:13:22 PDT 2022
Author: David Green
Date: 2022-05-06T08:13:18+01:00
New Revision: 100cb9a2ba9e30f3a2c058f349663271afc869e4
URL: https://github.com/llvm/llvm-project/commit/100cb9a2ba9e30f3a2c058f349663271afc869e4
DIFF: https://github.com/llvm/llvm-project/commit/100cb9a2ba9e30f3a2c058f349663271afc869e4.diff
LOG: [VectorCombine] Fold shuffle select pattern
This patch adds a combine to attempt to reduce the costs of certain
select-shuffle patterns. The form of code it attempts to detect is:
%x = shuffle ...
%y = shuffle ...
%a = binop %x, %y
%b = binop %x, %y
shuffle %a, %b, selectmask
A classic select-mask will pick items from each lane of a or b. These
do not always have a great lowering on many architectures. This patch
attempts to pack a and b into the lower elements, creating a differently
ordered shuffle for reconstructing the original, which may be better than
the select mask. This can be better for performance, especially if fewer
elements of a and b need to be computed and the input shuffles are
cheaper.
Because select-masks are just one form of shuffle, we generalize to any
mask. So long as the backend has decent costmodel for the shuffles, this
can generally improve things when they come up. For more basic cost
models the folds do not appear to be profitable, not getting past the
cost checks.
Differential Revision: https://reviews.llvm.org/D123911
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 82316a07771e..cf460907bc72 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -104,6 +104,7 @@ class VectorCombine {
bool scalarizeLoadExtract(Instruction &I);
bool foldShuffleOfBinops(Instruction &I);
bool foldShuffleFromReductions(Instruction &I);
+ bool foldSelectShuffle(Instruction &I);
void replaceValue(Value &Old, Value &New) {
Old.replaceAllUsesWith(&New);
@@ -1225,6 +1226,219 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
return false;
}
+/// This method looks for groups of shuffles acting on binops, of the form:
+/// %x = shuffle ...
+/// %y = shuffle ...
+/// %a = binop %x, %y
+/// %b = binop %x, %y
+/// shuffle %a, %b, selectmask
+/// We may, especially if the shuffle is wider than legal, be able to convert
+/// the shuffle to a form where only parts of a and b need to be computed. On
+/// architectures with no obvious "select" shuffle, this can reduce the total
+/// number of operations if the target reports them as cheaper.
+bool VectorCombine::foldSelectShuffle(Instruction &I) {
+ auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
+ auto *VT = dyn_cast<FixedVectorType>(I.getType());
+ if (!SVI || !VT)
+ return false;
+ auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
+ if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
+ VT != Op0->getType())
+ return false;
+ auto *SVI0A = dyn_cast<ShuffleVectorInst>(Op0->getOperand(0));
+ auto *SVI0B = dyn_cast<ShuffleVectorInst>(Op0->getOperand(1));
+ auto *SVI1A = dyn_cast<ShuffleVectorInst>(Op1->getOperand(0));
+ auto *SVI1B = dyn_cast<ShuffleVectorInst>(Op1->getOperand(1));
+ auto checkSVNonOpUses = [&](Instruction *I) {
+ if (!I || I->getOperand(0)->getType() != VT)
+ return true;
+ return any_of(I->users(), [&](User *U) { return U != Op0 && U != Op1; });
+ };
+ if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
+ checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
+ return false;
+
+ // Collect all the uses that are shuffles that we can transform together. We
+ // may not have a single shuffle, but a group that can all be transformed
+ // together profitably.
+ SmallVector<ShuffleVectorInst *> Shuffles;
+ auto collectShuffles = [&](Instruction *I) {
+ for (auto *U : I->users()) {
+ auto *SV = dyn_cast<ShuffleVectorInst>(U);
+ if (!SV || SV->getType() != VT)
+ return false;
+ if (find(Shuffles, SV) == Shuffles.end())
+ Shuffles.push_back(SV);
+ }
+ return true;
+ };
+ if (!collectShuffles(Op0) || !collectShuffles(Op1))
+ return false;
+
+ // For each of the output shuffles, we try to sort all the first vector
+ // elements to the beginning, followed by the second array elements at the
+ // end. If the binops are legalized to smaller vectors, this may reduce total
+ // number of binops. We compute the ReconstructMask mask needed to convert
+ // back to the original lane order.
+ SmallVector<int> V1, V2;
+ SmallVector<SmallVector<int>> ReconstructMasks;
+ int MaxV1Elt = 0, MaxV2Elt = 0;
+ unsigned NumElts = VT->getNumElements();
+ for (ShuffleVectorInst *SVN : Shuffles) {
+ SmallVector<int> Mask;
+ SVN->getShuffleMask(Mask);
+
+ // Check the operands are the same as the original, or reversed (in which
+ // case we need to commute the mask).
+ Value *SVOp0 = SVN->getOperand(0);
+ Value *SVOp1 = SVN->getOperand(1);
+ if (SVOp0 == Op1 && SVOp1 == Op0) {
+ std::swap(SVOp0, SVOp1);
+ ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
+ }
+ if (SVOp0 != Op0 || SVOp1 != Op1)
+ return false;
+
+ // Calculate the reconstruction mask for this shuffle, as the mask needed to
+ // take the packed values from Op0/Op1 and reconstruct to the original
+ // order.
+ SmallVector<int> ReconstructMask;
+ for (unsigned I = 0; I < Mask.size(); I++) {
+ if (Mask[I] < 0) {
+ ReconstructMask.push_back(-1);
+ } else if (Mask[I] < static_cast<int>(NumElts)) {
+ MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
+ auto It = find(V1, Mask[I]);
+ if (It != V1.end())
+ ReconstructMask.push_back(It - V1.begin());
+ else {
+ ReconstructMask.push_back(V1.size());
+ V1.push_back(Mask[I]);
+ }
+ } else {
+ MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
+ auto It = find(V2, Mask[I] - NumElts);
+ if (It != V2.end())
+ ReconstructMask.push_back(NumElts + It - V2.begin());
+ else {
+ ReconstructMask.push_back(NumElts + V2.size());
+ V2.push_back(Mask[I] - NumElts);
+ }
+ }
+ }
+
+ ReconstructMasks.push_back(ReconstructMask);
+ }
+
+ // If the maximum elements used from V1 and V2 are not larger than the new
+ // vectors, the vectors are already packed and performing the optimization
+ // again will likely not help any further. This also prevents us from getting
+ // stuck in a cycle in case the costs do not also rule it out.
+ if (V1.empty() || V2.empty() ||
+ (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
+ MaxV2Elt == static_cast<int>(V2.size()) - 1))
+ return false;
+
+ // Calculate the masks needed for the new input shuffles, which get padded
+ // with undef
+ SmallVector<int> V1A, V1B, V2A, V2B;
+ for (unsigned I = 0; I < V1.size(); I++) {
+ V1A.push_back(SVI0A->getMaskValue(V1[I]));
+ V1B.push_back(SVI0B->getMaskValue(V1[I]));
+ }
+ for (unsigned I = 0; I < V2.size(); I++) {
+ V2A.push_back(SVI1A->getMaskValue(V2[I]));
+ V2B.push_back(SVI1B->getMaskValue(V2[I]));
+ }
+ while (V1A.size() < NumElts) {
+ V1A.push_back(UndefMaskElem);
+ V1B.push_back(UndefMaskElem);
+ }
+ while (V2A.size() < NumElts) {
+ V2A.push_back(UndefMaskElem);
+ V2B.push_back(UndefMaskElem);
+ }
+
+ auto AddShuffleCost = [&](InstructionCost C, ShuffleVectorInst *SV) {
+ return C +
+ TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, SV->getShuffleMask());
+ };
+ auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
+ return C + TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, Mask);
+ };
+
+ // Get the costs of the shuffles + binops before and after with the new
+ // shuffle masks.
+ InstructionCost CostBefore =
+ TTI.getArithmeticInstrCost(Op0->getOpcode(), VT) +
+ TTI.getArithmeticInstrCost(Op1->getOpcode(), VT);
+ CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
+ InstructionCost(0), AddShuffleCost);
+ // This set helps us only cost each unique shuffle once.
+ SmallPtrSet<ShuffleVectorInst *, 4> InputShuffles(
+ {SVI0A, SVI0B, SVI1A, SVI1B});
+ CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
+ InstructionCost(0), AddShuffleCost);
+
+ // The new binops will be unused for lanes past the used shuffle lengths.
+ // These types attempt to get the correct cost for that from the target.
+ FixedVectorType *Op0SmallVT =
+ FixedVectorType::get(VT->getScalarType(), V1.size());
+ FixedVectorType *Op1SmallVT =
+ FixedVectorType::get(VT->getScalarType(), V2.size());
+ InstructionCost CostAfter =
+ TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT) +
+ TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT);
+ CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
+ InstructionCost(0), AddShuffleMaskCost);
+ std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
+ CostAfter +=
+ std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
+ InstructionCost(0), AddShuffleMaskCost);
+
+ if (CostBefore <= CostAfter)
+ return false;
+
+ // The cost model has passed, create the new instructions.
+ Builder.SetInsertPoint(SVI0A);
+ Value *NSV0A = Builder.CreateShuffleVector(SVI0A->getOperand(0),
+ SVI0A->getOperand(1), V1A);
+ Builder.SetInsertPoint(SVI0B);
+ Value *NSV0B = Builder.CreateShuffleVector(SVI0B->getOperand(0),
+ SVI0B->getOperand(1), V1B);
+ Builder.SetInsertPoint(SVI1A);
+ Value *NSV1A = Builder.CreateShuffleVector(SVI1A->getOperand(0),
+ SVI1A->getOperand(1), V2A);
+ Builder.SetInsertPoint(SVI1B);
+ Value *NSV1B = Builder.CreateShuffleVector(SVI1B->getOperand(0),
+ SVI1B->getOperand(1), V2B);
+ Builder.SetInsertPoint(Op0);
+ Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
+ NSV0A, NSV0B);
+ if (auto *I = dyn_cast<Instruction>(NOp0))
+ I->copyIRFlags(Op0, true);
+ Builder.SetInsertPoint(Op1);
+ Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
+ NSV1A, NSV1B);
+ if (auto *I = dyn_cast<Instruction>(NOp1))
+ I->copyIRFlags(Op1, true);
+
+ for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
+ Builder.SetInsertPoint(Shuffles[S]);
+ Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
+ replaceValue(*Shuffles[S], *NSV);
+ }
+
+ Worklist.pushValue(NSV0A);
+ Worklist.pushValue(NSV0B);
+ Worklist.pushValue(NSV1A);
+ Worklist.pushValue(NSV1B);
+ for (auto *S : Shuffles)
+ Worklist.add(S);
+ return true;
+}
+
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
bool VectorCombine::run() {
@@ -1245,6 +1459,7 @@ bool VectorCombine::run() {
MadeChange |= foldExtractedCmps(I);
MadeChange |= foldShuffleOfBinops(I);
MadeChange |= foldShuffleFromReductions(I);
+ MadeChange |= foldSelectShuffle(I);
}
MadeChange |= scalarizeBinopOrCmp(I);
MadeChange |= scalarizeLoadExtract(I);
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
index 28fac19b465e..3d6507a6e1f9 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll
@@ -128,11 +128,13 @@ define <16 x i32> @test2_1_ins(<16 x i32> %x1, <16 x i32> %x2) {
define <16 x i32> @test2_2(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: @test2_2(
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 29, i32 26, i32 7, i32 4, i32 3, i32 6, i32 5, i32 2, i32 9, i32 8, i32 17, i32 28, i32 27, i32 16, i32 31, i32 30>
-; CHECK-NEXT: [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT: [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> <i32 31, i32 28, i32 25, i32 22, i32 5, i32 4, i32 19, i32 8, i32 7, i32 18, i32 11, i32 10, i32 17, i32 14, i32 13, i32 16>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 21, i32 20, i32 24, i32 23, i32 11, i32 10, i32 14, i32 13, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 15, i32 12, i32 25, i32 22, i32 19, i32 18, i32 1, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 6, i32 3, i32 9, i32 2, i32 28, i32 17, i32 31, i32 16, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 30, i32 27, i32 8, i32 5, i32 4, i32 7, i32 26, i32 29, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 20, i32 2, i32 3, i32 21, i32 4, i32 5, i32 22, i32 6, i32 7, i32 23>
; CHECK-NEXT: ret <16 x i32> [[S3]]
;
%s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -166,11 +168,13 @@ define <16 x i32> @test2_12(<16 x i32> %x, <16 x i32> %y) {
define <16 x i32> @test3_1(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: @test3_1(
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 23, i32 27, i32 31, i32 6, i32 10, i32 14, i32 21, i32 25, i32 29, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 18, i32 3, i32 16, i32 1, i32 5, i32 9, i32 13, i32 20, i32 24, i32 28, i32 7, i32 11, i32 15, i32 22, i32 26, i32 30>
-; CHECK-NEXT: [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT: [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 17, i32 21, i32 25, i32 29, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 2, i32 19, i32 23, i32 27, i32 31, i32 6, i32 10, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 18, i32 3, i32 7, i32 11, i32 15, i32 22, i32 26, i32 30, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 16, i32 1, i32 5, i32 9, i32 13, i32 20, i32 24, i32 28, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <16 x i32> [[S3]]
;
%s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 23, i32 27, i32 31, i32 6, i32 10, i32 14, i32 21, i32 25, i32 29, i32 4, i32 8, i32 12>
@@ -183,11 +187,13 @@ define <16 x i32> @test3_1(<16 x i32> %x, <16 x i32> %y) {
define <16 x i32> @test3_2(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: @test3_2(
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 23, i32 27, i32 31, i32 6, i32 10, i32 14, i32 21, i32 25, i32 29, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 18, i32 3, i32 16, i32 1, i32 5, i32 9, i32 13, i32 20, i32 24, i32 28, i32 7, i32 11, i32 15, i32 22, i32 26, i32 30>
-; CHECK-NEXT: [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT: [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> <i32 29, i32 26, i32 7, i32 4, i32 3, i32 6, i32 5, i32 2, i32 9, i32 8, i32 17, i32 28, i32 27, i32 16, i32 31, i32 30>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 6, i32 23, i32 19, i32 31, i32 27, i32 2, i32 14, i32 10, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 4, i32 21, i32 17, i32 29, i32 25, i32 0, i32 12, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 20, i32 5, i32 1, i32 13, i32 9, i32 16, i32 28, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 22, i32 7, i32 3, i32 15, i32 11, i32 18, i32 30, i32 26, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: ret <16 x i32> [[S3]]
;
%s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 23, i32 27, i32 31, i32 6, i32 10, i32 14, i32 21, i32 25, i32 29, i32 4, i32 8, i32 12>
@@ -222,12 +228,14 @@ define <16 x i32> @test3_12(<16 x i32> %x, <16 x i32> %y) {
define <16 x i32> @test23(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: @test23(
-; CHECK-NEXT: [[S10:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[S20:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 29, i32 26, i32 7, i32 4, i32 3, i32 6, i32 5, i32 2, i32 9, i32 8, i32 17, i32 28, i32 27, i32 16, i32 31, i32 30>
-; CHECK-NEXT: [[A0:%.*]] = add nsw <16 x i32> [[S10]], [[S20]]
-; CHECK-NEXT: [[B0:%.*]] = sub nsw <16 x i32> [[S10]], [[S20]]
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[A0]], <16 x i32> [[B0]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 23, i32 27, i32 31, i32 6, i32 10, i32 14, i32 21, i32 25, i32 29, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[B0]], <16 x i32> [[A0]], <16 x i32> <i32 18, i32 3, i32 16, i32 1, i32 5, i32 9, i32 13, i32 20, i32 24, i32 28, i32 7, i32 11, i32 15, i32 22, i32 26, i32 30>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 18, i32 0, i32 20, i32 24, i32 12, i32 22, i32 10, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 19, i32 1, i32 21, i32 25, i32 13, i32 23, i32 11, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 7, i32 29, i32 3, i32 9, i32 27, i32 5, i32 17, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[X]], <16 x i32> <i32 4, i32 26, i32 6, i32 8, i32 16, i32 2, i32 28, i32 30, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 1, i32 17, i32 0, i32 16, i32 21, i32 22, i32 23, i32 5, i32 6, i32 7, i32 18, i32 19, i32 20, i32 2, i32 3, i32 4>
+; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 18, i32 19, i32 20, i32 2, i32 3, i32 4, i32 21, i32 22, i32 23, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
; CHECK-NEXT: [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -251,11 +259,13 @@ define <16 x i32> @test23(<16 x i32> %x, <16 x i32> %y) {
define <16 x i32> @testgood(<16 x i32> %x, <16 x i32> %y) {
; CHECK-LABEL: @testgood(
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
-; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
-; CHECK-NEXT: [[A:%.*]] = add nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT: [[B:%.*]] = sub nsw <16 x i32> [[S1]], [[S2]]
-; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[A]], <16 x i32> [[B]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <16 x i32> <i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <16 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
; CHECK-NEXT: ret <16 x i32> [[S3]]
;
%s1 = shufflevector <16 x i32> %x, <16 x i32> %y, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
@@ -361,14 +371,15 @@ define void @test_31(ptr %src, ptr %dst) {
define <16 x i32> @test_1651256324(<16 x i32> %l0, <16 x i32> %l1, <16 x i32> %l6, <16 x i32> %l7) {
; CHECK-LABEL: @test_1651256324(
-; CHECK-NEXT: [[S0:%.*]] = shufflevector <16 x i32> [[L0:%.*]], <16 x i32> [[L6:%.*]], <16 x i32> <i32 1, i32 20, i32 15, i32 3, i32 1, i32 10, i32 17, i32 25, i32 29, i32 23, i32 20, i32 10, i32 0, i32 20, i32 30, i32 30>
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i32> [[L1:%.*]], <16 x i32> [[L1]], <16 x i32> <i32 11, i32 22, i32 1, i32 7, i32 20, i32 0, i32 2, i32 24, i32 28, i32 10, i32 31, i32 12, i32 22, i32 5, i32 11, i32 4>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[L0:%.*]], <16 x i32> [[L6:%.*]], <16 x i32> <i32 30, i32 10, i32 1, i32 20, i32 10, i32 0, i32 20, i32 3, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[L1:%.*]], <16 x i32> [[L1]], <16 x i32> <i32 24, i32 1, i32 10, i32 0, i32 5, i32 7, i32 11, i32 11, i32 4, i32 12, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[L1]], <16 x i32> [[L1]], <16 x i32> <i32 24, i32 1, i32 10, i32 0, i32 5, i32 7, i32 11, i32 11, i32 4, i32 12, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i32> [[L7:%.*]], <16 x i32> [[L7]], <16 x i32> <i32 23, i32 20, i32 29, i32 25, i32 14, i32 21, i32 11, i32 9, i32 2, i32 7, i32 5, i32 15, i32 24, i32 30, i32 26, i32 5>
-; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i32> [[L6]], <16 x i32> [[L7]], <16 x i32> <i32 29, i32 19, i32 15, i32 30, i32 13, i32 0, i32 30, i32 23, i32 26, i32 3, i32 15, i32 24, i32 29, i32 8, i32 4, i32 0>
-; CHECK-NEXT: [[ADD:%.*]] = add <16 x i32> [[S1]], [[S1]]
-; CHECK-NEXT: [[SUB:%.*]] = sub <16 x i32> [[S0]], [[S3]]
-; CHECK-NEXT: [[T0:%.*]] = shufflevector <16 x i32> [[ADD]], <16 x i32> [[SUB]], <16 x i32> <i32 20, i32 0, i32 29, i32 2, i32 15, i32 2, i32 11, i32 7, i32 14, i32 14, i32 19, i32 30, i32 20, i32 1, i32 18, i32 28>
-; CHECK-NEXT: [[T1:%.*]] = shufflevector <16 x i32> [[SUB]], <16 x i32> [[ADD]], <16 x i32> <i32 23, i32 18, i32 25, i32 14, i32 11, i32 21, i32 29, i32 21, i32 4, i32 10, i32 21, i32 19, i32 18, i32 5, i32 12, i32 30>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[L6]], <16 x i32> [[L7]], <16 x i32> <i32 4, i32 24, i32 13, i32 15, i32 0, i32 29, i32 8, i32 30, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub <16 x i32> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[T0:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 18, i32 7, i32 22, i32 1, i32 8, i32 1, i32 9, i32 0, i32 6, i32 6, i32 23, i32 16, i32 18, i32 10, i32 24, i32 21>
+; CHECK-NEXT: [[T1:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32> <i32 0, i32 1, i32 2, i32 16, i32 17, i32 3, i32 4, i32 3, i32 18, i32 19, i32 3, i32 5, i32 1, i32 20, i32 21, i32 6>
; CHECK-NEXT: [[R:%.*]] = xor <16 x i32> [[T0]], [[T1]]
; CHECK-NEXT: ret <16 x i32> [[R]]
;
@@ -495,27 +506,29 @@ define dso_local i32 @full(i8* nocapture noundef readonly %p1, i32 noundef %st1,
; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; CHECK-NEXT: [[TMP60:%.*]] = add nsw <16 x i32> [[TMP59]], [[REORDER]]
; CHECK-NEXT: [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP59]], [[REORDER]]
-; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 22, i32 18, i32 26, i32 30, i32 5, i32 1, i32 9, i32 13, i32 20, i32 16, i32 24, i32 28>
-; CHECK-NEXT: [[REORDER1:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 20, i32 16, i32 24, i32 28, i32 7, i32 3, i32 11, i32 15, i32 22, i32 18, i32 26, i32 30>
-; CHECK-NEXT: [[TMP63:%.*]] = add nsw <16 x i32> [[TMP62]], [[REORDER1]]
-; CHECK-NEXT: [[TMP64:%.*]] = sub nsw <16 x i32> [[TMP62]], [[REORDER1]]
-; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> [[TMP64]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT: [[REORDER2:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> [[TMP64]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 25, i32 24, i32 27, i32 26, i32 29, i32 28, i32 31, i32 30>
-; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP65]], [[REORDER2]]
-; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP65]], [[REORDER2]]
-; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 24, i32 9, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>
-; CHECK-NEXT: [[REORDER3:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 2, i32 19, i32 0, i32 17, i32 23, i32 6, i32 5, i32 20, i32 27, i32 10, i32 9, i32 24, i32 31, i32 14, i32 13, i32 28>
-; CHECK-NEXT: [[TMP69:%.*]] = add nsw <16 x i32> [[TMP68]], [[REORDER3]]
-; CHECK-NEXT: [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP68]], [[REORDER3]]
-; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
-; CHECK-NEXT: [[TMP72:%.*]] = lshr <16 x i32> [[TMP71]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT: [[TMP73:%.*]] = and <16 x i32> [[TMP72]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP74:%.*]] = mul nuw <16 x i32> [[TMP73]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP75:%.*]] = add <16 x i32> [[TMP74]], [[TMP71]]
-; CHECK-NEXT: [[TMP76:%.*]] = xor <16 x i32> [[TMP75]], [[TMP74]]
-; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP76]])
-; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP77]], 65535
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP77]], 16
+; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 7, i32 3, i32 15, i32 11, i32 18, i32 22, i32 30, i32 26, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 1, i32 5, i32 13, i32 9, i32 16, i32 20, i32 28, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 5, i32 1, i32 13, i32 9, i32 16, i32 20, i32 28, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP65:%.*]] = shufflevector <16 x i32> [[TMP60]], <16 x i32> [[TMP61]], <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 18, i32 22, i32 30, i32 26, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP66:%.*]] = add nsw <16 x i32> [[TMP62]], [[TMP64]]
+; CHECK-NEXT: [[TMP67:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP65]]
+; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22>
+; CHECK-NEXT: [[REORDER2:%.*]] = shufflevector <16 x i32> [[TMP66]], <16 x i32> [[TMP67]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[TMP69:%.*]] = add nsw <16 x i32> [[TMP68]], [[REORDER2]]
+; CHECK-NEXT: [[TMP70:%.*]] = sub nsw <16 x i32> [[TMP68]], [[REORDER2]]
+; CHECK-NEXT: [[TMP71:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 24, i32 9, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>
+; CHECK-NEXT: [[REORDER3:%.*]] = shufflevector <16 x i32> [[TMP69]], <16 x i32> [[TMP70]], <16 x i32> <i32 2, i32 19, i32 0, i32 17, i32 23, i32 6, i32 5, i32 20, i32 27, i32 10, i32 9, i32 24, i32 31, i32 14, i32 13, i32 28>
+; CHECK-NEXT: [[TMP72:%.*]] = add nsw <16 x i32> [[TMP71]], [[REORDER3]]
+; CHECK-NEXT: [[TMP73:%.*]] = sub nsw <16 x i32> [[TMP71]], [[REORDER3]]
+; CHECK-NEXT: [[TMP74:%.*]] = shufflevector <16 x i32> [[TMP72]], <16 x i32> [[TMP73]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+; CHECK-NEXT: [[TMP75:%.*]] = lshr <16 x i32> [[TMP74]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT: [[TMP76:%.*]] = and <16 x i32> [[TMP75]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP77:%.*]] = mul nuw <16 x i32> [[TMP76]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP78:%.*]] = add <16 x i32> [[TMP77]], [[TMP74]]
+; CHECK-NEXT: [[TMP79:%.*]] = xor <16 x i32> [[TMP78]], [[TMP77]]
+; CHECK-NEXT: [[TMP80:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP79]])
+; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP80]], 65535
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP80]], 16
; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
; CHECK-NEXT: ret i32 [[SHR120]]
@@ -700,27 +713,29 @@ define i32 @full_reorder(ptr nocapture noundef readonly %pix1, i32 noundef %i_pi
; CHECK-NEXT: [[REORDER:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
; CHECK-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[TMP51]], [[REORDER]]
; CHECK-NEXT: [[TMP53:%.*]] = sub nsw <16 x i32> [[TMP51]], [[REORDER]]
-; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 22, i32 18, i32 26, i32 30, i32 5, i32 1, i32 9, i32 13, i32 20, i32 16, i32 24, i32 28>
-; CHECK-NEXT: [[REORDER191:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 20, i32 16, i32 24, i32 28, i32 7, i32 3, i32 11, i32 15, i32 22, i32 18, i32 26, i32 30>
-; CHECK-NEXT: [[TMP55:%.*]] = add nsw <16 x i32> [[TMP54]], [[REORDER191]]
-; CHECK-NEXT: [[TMP56:%.*]] = sub nsw <16 x i32> [[TMP54]], [[REORDER191]]
-; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> [[TMP56]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT: [[REORDER192:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> [[TMP56]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 25, i32 24, i32 27, i32 26, i32 29, i32 28, i32 31, i32 30>
-; CHECK-NEXT: [[TMP58:%.*]] = add nsw <16 x i32> [[TMP57]], [[REORDER192]]
-; CHECK-NEXT: [[TMP59:%.*]] = sub nsw <16 x i32> [[TMP57]], [[REORDER192]]
-; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 24, i32 9, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>
-; CHECK-NEXT: [[REORDER193:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> <i32 2, i32 19, i32 0, i32 17, i32 23, i32 6, i32 5, i32 20, i32 27, i32 10, i32 9, i32 24, i32 31, i32 14, i32 13, i32 28>
-; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP60]], [[REORDER193]]
-; CHECK-NEXT: [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP60]], [[REORDER193]]
-; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
-; CHECK-NEXT: [[TMP64:%.*]] = lshr <16 x i32> [[TMP63]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
-; CHECK-NEXT: [[TMP65:%.*]] = and <16 x i32> [[TMP64]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
-; CHECK-NEXT: [[TMP66:%.*]] = mul nuw <16 x i32> [[TMP65]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT: [[TMP67:%.*]] = add <16 x i32> [[TMP66]], [[TMP63]]
-; CHECK-NEXT: [[TMP68:%.*]] = xor <16 x i32> [[TMP67]], [[TMP66]]
-; CHECK-NEXT: [[TMP69:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP68]])
-; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP69]], 65535
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP69]], 16
+; CHECK-NEXT: [[TMP54:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> <i32 7, i32 3, i32 15, i32 11, i32 18, i32 22, i32 30, i32 26, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> <i32 1, i32 5, i32 13, i32 9, i32 16, i32 20, i32 28, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> <i32 5, i32 1, i32 13, i32 9, i32 16, i32 20, i32 28, i32 24, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <16 x i32> [[TMP52]], <16 x i32> [[TMP53]], <16 x i32> <i32 3, i32 7, i32 15, i32 11, i32 18, i32 22, i32 30, i32 26, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP58:%.*]] = add nsw <16 x i32> [[TMP54]], [[TMP56]]
+; CHECK-NEXT: [[TMP59:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP57]]
+; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 17, i32 16, i32 19, i32 18, i32 21, i32 20, i32 23, i32 22>
+; CHECK-NEXT: [[REORDER192:%.*]] = shufflevector <16 x i32> [[TMP58]], <16 x i32> [[TMP59]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[TMP61:%.*]] = add nsw <16 x i32> [[TMP60]], [[REORDER192]]
+; CHECK-NEXT: [[TMP62:%.*]] = sub nsw <16 x i32> [[TMP60]], [[REORDER192]]
+; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 24, i32 9, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31>
+; CHECK-NEXT: [[REORDER193:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> <i32 2, i32 19, i32 0, i32 17, i32 23, i32 6, i32 5, i32 20, i32 27, i32 10, i32 9, i32 24, i32 31, i32 14, i32 13, i32 28>
+; CHECK-NEXT: [[TMP64:%.*]] = add nsw <16 x i32> [[TMP63]], [[REORDER193]]
+; CHECK-NEXT: [[TMP65:%.*]] = sub nsw <16 x i32> [[TMP63]], [[REORDER193]]
+; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <16 x i32> [[TMP64]], <16 x i32> [[TMP65]], <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 4, i32 5, i32 22, i32 23, i32 8, i32 9, i32 26, i32 27, i32 12, i32 13, i32 30, i32 31>
+; CHECK-NEXT: [[TMP67:%.*]] = lshr <16 x i32> [[TMP66]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
+; CHECK-NEXT: [[TMP68:%.*]] = and <16 x i32> [[TMP67]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
+; CHECK-NEXT: [[TMP69:%.*]] = mul nuw <16 x i32> [[TMP68]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+; CHECK-NEXT: [[TMP70:%.*]] = add <16 x i32> [[TMP69]], [[TMP66]]
+; CHECK-NEXT: [[TMP71:%.*]] = xor <16 x i32> [[TMP70]], [[TMP69]]
+; CHECK-NEXT: [[TMP72:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP71]])
+; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP72]], 65535
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP72]], 16
; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]]
; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1
; CHECK-NEXT: ret i32 [[SHR120]]
More information about the llvm-commits
mailing list