[llvm] 93c9684 - [VectorCombine] New folding pattern for extract/binop/shuffle chains (#145232)

Sun Aug 24 01:51:51 PDT 2025

Author: Rajveer Singh Bharadwaj
Date: 2025-08-24T14:21:48+05:30
New Revision: 93c96849c89507579a77980ff03adbeabb413573

URL: https://github.com/llvm/llvm-project/commit/93c96849c89507579a77980ff03adbeabb413573
DIFF: https://github.com/llvm/llvm-project/commit/93c96849c89507579a77980ff03adbeabb413573.diff

LOG: [VectorCombine] New folding pattern for extract/binop/shuffle chains (#145232)

Resolves #144654
Part of #143088

This adds a new `foldShuffleChainsToReduce` for horizontal reduction of
patterns like:

```llvm
define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
  %7 = extractelement <8 x i16> %6, i64 0
  ret i16 %7
}
```
...which can be reduced to a llvm.vector.reduce.umin.v8i16(%a0)
intrinsic call.

Similar transformation for other ops when costs permit to do so.

Added: 
    llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll
    llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll

Modified: 
    llvm/include/llvm/Transforms/Utils/LoopUtils.h
    llvm/lib/Transforms/Utils/LoopUtils.cpp
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 723f6aea1b76f..96e3d3d47f2d0 100644

--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -371,6 +371,9 @@ LLVM_ABI bool canSinkOrHoistInst(Instruction &I, AAResults *AA,
 /// Returns the llvm.vector.reduce intrinsic that corresponds to the recurrence
 /// kind.
 LLVM_ABI constexpr Intrinsic::ID getReductionIntrinsicID(RecurKind RK);
+/// Returns the llvm.vector.reduce min/max intrinsic that corresponds to the
+/// intrinsic op.
+LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID);
 
 /// Returns the arithmetic instruction opcode used when expanding a reduction.
 LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID);

diff  --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 2d830f3b6f952..843364eb34f83 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -956,6 +956,21 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
   }
 }
 
+Intrinsic::ID llvm::getMinMaxReductionIntrinsicID(Intrinsic::ID IID) {
+  switch (IID) {
+  default:
+    llvm_unreachable("Unexpected intrinsic id");
+  case Intrinsic::umin:
+    return Intrinsic::vector_reduce_umin;
+  case Intrinsic::umax:
+    return Intrinsic::vector_reduce_umax;
+  case Intrinsic::smin:
+    return Intrinsic::vector_reduce_smin;
+  case Intrinsic::smax:
+    return Intrinsic::vector_reduce_smax;
+  }
+}
+
 // This is the inverse to getReductionForBinop
 unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
   switch (RdxID) {

diff  --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 613dec121e016..221970f1ebb41 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -135,6 +135,7 @@ class VectorCombine {
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
+  bool foldShuffleChainsToReduce(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
@@ -3136,6 +3137,267 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
   return MadeChanges;
 }
 
+/// For a given chain of patterns of the following form:
+///
+/// ```
+///   %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
+///
+///   %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
+///   ty1> %1)
+///     OR
+///   %2 = add/mul/or/and/xor <n x ty1> %0, %1
+///
+///   %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
+///   ...
+///   ...
+///   %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
+///   3), <n x ty1> %(i - 2)
+///     OR
+///   %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
+///
+///   %(i) = extractelement <n x ty1> %(i - 1), 0
+/// ```
+///
+/// Where:
+///    `mask` follows a partition pattern:
+///
+/// Ex:
+///    [n = 8, p = poison]
+///
+///    4 5 6 7 | p p p p
+///    2 3 | p p p p p p
+///    1 | p p p p p p p
+///
+///    For powers of 2, there's a consistent pattern, but for other cases
+///    the parity of the current half value at each step decides the
+///    next partition half (see `ExpectedParityMask` for more logical details
+///    in generalising this).
+///
+/// Ex:
+///    [n = 6]
+///
+///    3 4 5 | p p p
+///    1 2 | p p p p
+///    1 | p p p p p
+bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+  // Going bottom-up for the pattern.
+  std::queue<Value *> InstWorklist;
+  InstructionCost OrigCost = 0;
+
+  // Common instruction operation after each shuffle op.
+  std::optional<unsigned int> CommonCallOp = std::nullopt;
+  std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
+
+  bool IsFirstCallOrBinInst = true;
+  bool ShouldBeCallOrBinInst = true;
+
+  // This stores the last used instructions for shuffle/common op.
+  //
+  // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
+  // instructions from either shuffle/common op.
+  SmallVector<Value *, 2> PrevVecV(2, nullptr);
+
+  Value *VecOpEE;
+  if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
+    return false;
+
+  auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
+  if (!FVT)
+    return false;
+
+  int64_t VecSize = FVT->getNumElements();
+  if (VecSize < 2)
+    return false;
+
+  // Number of levels would be ~log2(n), considering we always partition
+  // by half for this fold pattern.
+  unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
+  int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
+
+  // This is how we generalise for all element sizes.
+  // At each step, if vector size is odd, we need non-poison
+  // values to cover the dominant half so we don't miss out on any element.
+  //
+  // This mask will help us retrieve this as we go from bottom to top:
+  //
+  // Mask Set -> N = N * 2 - 1
+  // Mask Unset -> N = N * 2
+  for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
+       Cur = (Cur + 1) / 2, --Mask) {
+    if (Cur & 1)
+      ExpectedParityMask |= (1ll << Mask);
+  }
+
+  InstWorklist.push(VecOpEE);
+
+  while (!InstWorklist.empty()) {
+    Value *CI = InstWorklist.front();
+    InstWorklist.pop();
+
+    if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
+      if (!ShouldBeCallOrBinInst)
+        return false;
+
+      if (!IsFirstCallOrBinInst &&
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      // For the first found call/bin op, the vector has to come from the
+      // extract element op.
+      if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
+        return false;
+      IsFirstCallOrBinInst = false;
+
+      if (!CommonCallOp)
+        CommonCallOp = II->getIntrinsicID();
+      if (II->getIntrinsicID() != *CommonCallOp)
+        return false;
+
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::umin:
+      case Intrinsic::umax:
+      case Intrinsic::smin:
+      case Intrinsic::smax: {
+        auto *Op0 = II->getOperand(0);
+        auto *Op1 = II->getOperand(1);
+        PrevVecV[0] = Op0;
+        PrevVecV[1] = Op1;
+        break;
+      }
+      default:
+        return false;
+      }
+      ShouldBeCallOrBinInst ^= 1;
+
+      IntrinsicCostAttributes ICA(
+          *CommonCallOp, II->getType(),
+          {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
+      OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
+
+      // We may need a swap here since it can be (a, b) or (b, a)
+      // and accordingly change as we go up.
+      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
+        std::swap(PrevVecV[0], PrevVecV[1]);
+      InstWorklist.push(PrevVecV[1]);
+      InstWorklist.push(PrevVecV[0]);
+    } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
+      // Similar logic for bin ops.
+
+      if (!ShouldBeCallOrBinInst)
+        return false;
+
+      if (!IsFirstCallOrBinInst &&
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
+        return false;
+      IsFirstCallOrBinInst = false;
+
+      if (!CommonBinOp)
+        CommonBinOp = BinOp->getOpcode();
+
+      if (BinOp->getOpcode() != *CommonBinOp)
+        return false;
+
+      switch (*CommonBinOp) {
+      case BinaryOperator::Add:
+      case BinaryOperator::Mul:
+      case BinaryOperator::Or:
+      case BinaryOperator::And:
+      case BinaryOperator::Xor: {
+        auto *Op0 = BinOp->getOperand(0);
+        auto *Op1 = BinOp->getOperand(1);
+        PrevVecV[0] = Op0;
+        PrevVecV[1] = Op1;
+        break;
+      }
+      default:
+        return false;
+      }
+      ShouldBeCallOrBinInst ^= 1;
+
+      OrigCost +=
+          TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
+
+      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
+        std::swap(PrevVecV[0], PrevVecV[1]);
+      InstWorklist.push(PrevVecV[1]);
+      InstWorklist.push(PrevVecV[0]);
+    } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
+      // We shouldn't have any null values in the previous vectors,
+      // is so, there was a mismatch in pattern.
+      if (ShouldBeCallOrBinInst ||
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (SVInst != PrevVecV[1])
+        return false;
+
+      ArrayRef<int> CurMask;
+      if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
+                                   m_Mask(CurMask))))
+        return false;
+
+      // Subtract the parity mask when checking the condition.
+      for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
+        if (Mask < ShuffleMaskHalf &&
+            CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
+          return false;
+        if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
+          return false;
+      }
+
+      // Update mask values.
+      ShuffleMaskHalf *= 2;
+      ShuffleMaskHalf -= (ExpectedParityMask & 1);
+      ExpectedParityMask >>= 1;
+
+      OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                     SVInst->getType(), SVInst->getType(),
+                                     CurMask, CostKind);
+
+      VisitedCnt += 1;
+      if (!ExpectedParityMask && VisitedCnt == NumLevels)
+        break;
+
+      ShouldBeCallOrBinInst ^= 1;
+    } else {
+      return false;
+    }
+  }
+
+  // Pattern should end with a shuffle op.
+  if (ShouldBeCallOrBinInst)
+    return false;
+
+  assert(VecSize != -1 && "Expected Match for Vector Size");
+
+  Value *FinalVecV = PrevVecV[0];
+  if (!FinalVecV)
+    return false;
+
+  auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
+
+  Intrinsic::ID ReducedOp =
+      (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
+                    : getReductionForBinop(*CommonBinOp));
+  if (!ReducedOp)
+    return false;
+
+  IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
+  InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+
+  if (NewCost >= OrigCost)
+    return false;
+
+  auto *ReducedResult =
+      Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
+  replaceValue(I, *ReducedResult);
+
+  return true;
+}
+
 /// Determine if its more efficient to fold:
 ///   reduce(trunc(x)) -> trunc(reduce(x)).
 ///   reduce(sext(x))  -> sext(reduce(x)).
@@ -4223,6 +4485,9 @@ bool VectorCombine::run() {
         if (foldCastFromReductions(I))
           return true;
         break;
+      case Instruction::ExtractElement:
+        if (foldShuffleChainsToReduce(I))
+          return true;
       case Instruction::ICmp:
       case Instruction::FCmp:
         if (foldExtractExtract(I))

diff  --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll
new file mode 100644
index 0000000000000..82b20ccc5b8f5
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll
@@ -0,0 +1,200 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes=vector-combine -S %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes=vector-combine -S %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes=vector-combine -S %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v4 -passes=vector-combine -S %s | FileCheck %s
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+  %7 = extractelement <8 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+;
+; CHECK-LABEL: define i8 @test_reduce_v16i8(
+; CHECK-SAME: <16 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[A0]])
+; CHECK-NEXT:    ret i8 [[TMP8]]
+;
+  %1 = shufflevector <16 x i8> %a0, <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a0, <16 x i8> %1)
+  %3 = shufflevector <16 x i8> %2, <16 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %2, <16 x i8> %3)
+  %5 = shufflevector <16 x i8> %4, <16 x i8> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %4, <16 x i8> %5)
+  %7 = shufflevector <16 x i8> %6, <16 x i8> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %8 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %6, <16 x i8> %7)
+  %9 = extractelement <16 x i8> %8, i64 0
+  ret i8 %9
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; CHECK-LABEL: define i8 @test_reduce_v32i8(
+; CHECK-SAME: <32 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> [[A0]])
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = shufflevector <32 x i8> %a0, <32 x i8> poison, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+  i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %a0, <32 x i8> %1)
+  %3 = shufflevector <32 x i8> %2, <32 x i8> poison, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %2, <32 x i8> %3)
+  %5 = shufflevector <32 x i8> %4, <32 x i8> poison, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %4, <32 x i8> %5)
+  %7 = shufflevector <32 x i8> %6, <32 x i8> poison, <32 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %8 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %6, <32 x i8> %7)
+  %9 = shufflevector <32 x i8> %8, <32 x i8> poison, <32 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %8, <32 x i8> %9)
+  %11 = extractelement <32 x i8> %10, i64 0
+  ret i8 %11
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v16i16(
+; CHECK-SAME: <16 x i16> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <16 x i16> %a0, <16 x i16> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %a0, <16 x i16> %1)
+  %3 = shufflevector <16 x i16> %2, <16 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %2, <16 x i16> %3)
+  %5 = shufflevector <16 x i16> %4, <16 x i16> poison, <16 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %4, <16 x i16> %5)
+  %7 = shufflevector <16 x i16> %6, <16 x i16> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %8 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %6, <16 x i16> %7)
+  %9 = extractelement <16 x i16> %8, i64 0
+  ret i16 %9
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; CHECK-LABEL: define i8 @test_reduce_v64i8(
+; CHECK-SAME: <64 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> [[A0]])
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = shufflevector <64 x i8> %a0, <64 x i8> poison, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39,
+  i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47,
+  i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55,
+  i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %a0, <64 x i8> %1)
+  %3 = shufflevector <64 x i8> %2, <64 x i8> poison, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+  i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %2, <64 x i8> %3)
+  %5 = shufflevector <64 x i8> %4, <64 x i8> poison, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %4, <64 x i8> %5)
+  %7 = shufflevector <64 x i8> %6, <64 x i8> poison, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %8 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %6, <64 x i8> %7)
+  %9 = shufflevector <64 x i8> %8, <64 x i8> poison, <64 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %8, <64 x i8> %9)
+  %11 = shufflevector <64 x i8> %10, <64 x i8> poison, <64 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %12 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %10, <64 x i8> %11)
+  %13 = extractelement <64 x i8> %12, i64 0
+  ret i8 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v32i16(
+; CHECK-SAME: <32 x i16> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <32 x i16> %a0, <32 x i16> poison, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+  i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %a0, <32 x i16> %1)
+  %3 = shufflevector <32 x i16> %2, <32 x i16> poison, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %2, <32 x i16> %3)
+  %5 = shufflevector <32 x i16> %4, <32 x i16> poison, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %4, <32 x i16> %5)
+  %7 = shufflevector <32 x i16> %6, <32 x i16> poison, <32 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %8 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %6, <32 x i16> %7)
+  %9 = shufflevector <32 x i16> %8, <32 x i16> poison, <32 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison,
+  i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %8, <32 x i16> %9)
+  %11 = extractelement <32 x i16> %10, i64 0
+  ret i16 %11
+}

diff  --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
new file mode 100644
index 0000000000000..403ce33b5344e
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
@@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+  %7 = extractelement <8 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i16 @test_reduce_v7i16_or(<7 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v7i16_or(
+; CHECK-SAME: <7 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v7i16(<7 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <7 x i16> %a0, <7 x i16> poison, <7 x i32> <i32 3, i32 4, i32 5, i32 6, i32 poison, i32 poison, i32 poison>
+  %2 = or <7 x i16> %a0, %1
+  %3 = shufflevector <7 x i16> %2, <7 x i16> poison, <7 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = or <7 x i16> %2, %3
+  %5 = shufflevector <7 x i16> %4, <7 x i16> poison, <7 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = or <7 x i16> %4, %5
+  %7 = extractelement <7 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i16 @test_reduce_v3i16_and(<3 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v3i16_and(
+; CHECK-SAME: <3 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.and.v3i16(<3 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <3 x i16> %a0, <3 x i16> poison, <3 x i32> <i32 1, i32 2, i32 poison>
+  %2 = and <3 x i16> %a0, %1
+  %3 = shufflevector <3 x i16> %2, <3 x i16> poison, <3 x i32> <i32 1, i32 poison, i32 poison>
+  %4 = and <3 x i16> %2, %3
+  %5 = extractelement <3 x i16> %4, i64 0
+  ret i16 %5
+}
+
+define i16 @test_reduce_v6i16_xor(<6 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v6i16_xor(
+; CHECK-SAME: <6 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.xor.v6i16(<6 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32> <i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison>
+  %2 = xor <6 x i16> %a0, %1
+  %3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = xor <6 x i16> %2, %3
+  %5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = xor <6 x i16> %4, %5
+  %7 = extractelement <6 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i16 @test_reduce_v8i16_2(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_2(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP8]])
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP9]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]])
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i16> [[TMP16]], i64 0
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call i16 @llvm.umin.i16(i16 [[TMP13]], i16 [[TMP14]])
+; CHECK-NEXT:    ret i16 [[TMP15]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+  %7 = extractelement <8 x i16> %6, i64 0
+
+  %8 = shufflevector <8 x i16> %6, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+  %9 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %8)
+  %10 = shufflevector <8 x i16> %9, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %11 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %9, <8 x i16> %10)
+  %12 = shufflevector <8 x i16> %11, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %11, <8 x i16> %12)
+  %14 = extractelement <8 x i16> %13, i64 0
+
+  %15 = tail call i16 @llvm.umin.i16(i16 %7, i16 %14)
+
+  ret i16 %15
+}
+
+define i16 @test_reduce_v8i16_neg1(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_neg1(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0
+; CHECK-NEXT:    ret i16 [[TMP7]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 1, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+  %7 = extractelement <8 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i16 @test_reduce_v8i16_neg2(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_neg2(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0
+; CHECK-NEXT:    ret i16 [[TMP7]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %4, <8 x i16> %5)
+  %7 = extractelement <8 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i16 @test_reduce_v8i16_neg3(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_neg3(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i64 0
+; CHECK-NEXT:    ret i16 [[TMP8]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %6 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %7 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %5, <8 x i16> %6)
+  %8 = extractelement <8 x i16> %7, i64 0
+  ret i16 %8
+}
+
+define i16 @test_reduce_v6i16_xor_neg(<6 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v6i16_xor_neg(
+; CHECK-SAME: <6 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <6 x i16> [[A0]], <6 x i16> poison, <6 x i32> <i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <6 x i16> [[A0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <6 x i16> [[TMP2]], <6 x i16> poison, <6 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <6 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <6 x i16> [[TMP4]], <6 x i16> poison, <6 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <6 x i16> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <6 x i16> [[TMP6]], i64 0
+; CHECK-NEXT:    ret i16 [[TMP7]]
+;
+  %1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32> <i32 3, i32 4, i32 5, i32 poison, i32 poison, i32 poison>
+  %2 = xor <6 x i16> %a0, %1
+  %3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+  %4 = xor <6 x i16> %2, %3
+  %5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %6 = xor <6 x i16> %4, %5
+  %7 = extractelement <6 x i16> %6, i64 0
+  ret i16 %7
+}