[llvm] 9f7d415 - [X86] Move combineLoopMAddPattern and combineLoopSADPattern to an IR pass before SelectionDAG.

Craig Topper via llvm-commits <llvm-commits@lists.llvm.org>
Thu Mar 26 14:10:33 PDT 2020


Author: Craig Topper
Date: 2020-03-26T14:10:20-07:00
New Revision: 9f7d4150b9ec638a048c183c21a355195fdc4942

URL: https://github.com/llvm/llvm-project/commit/9f7d4150b9ec638a048c183c21a355195fdc4942
DIFF: https://github.com/llvm/llvm-project/commit/9f7d4150b9ec638a048c183c21a355195fdc4942.diff

LOG: [X86] Move combineLoopMAddPattern and combineLoopSADPattern to an IR pass before SelectionDAG.

These transforms rely on a vector reduction flag on the SDNode
set by SelectionDAGBuilder. This flag exists because SelectionDAG
can't see across basic blocks, so SelectionDAGBuilder looks across
blocks and saves the information. X86 is currently the only target
that uses this flag. By removing the X86 code we can remove the
flag and the SelectionDAGBuilder code.

This patch adds a dedicated IR pass for X86 that looks across
basic blocks and transforms the IR into a form that the X86
SelectionDAG can finish.

An advantage of this new approach is that we can enhance it to
shrink the phi nodes and final reduction tree based on the zeroes
that we need to concatenate to bring the partially reduced value
back up to the original width.
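
A rough sketch of the rewrite tryMAddReplacement performs, using
invented value names and assuming an <8 x i32> reduction whose
multiply operands are sign extensions from i16 (so the products
fit in 16 bits):

    ; before: the multiply feeds the vector reduction add directly
    %mul      = mul <8 x i32> %a.sext, %b.sext
    %acc.next = add <8 x i32> %mul, %acc

    ; after: add even/odd element pairs at half width, then pad with zeroes
    %even     = shufflevector <8 x i32> %mul, <8 x i32> %mul, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
    %odd      = shufflevector <8 x i32> %mul, <8 x i32> %mul, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
    %madd     = add <4 x i32> %even, %odd
    %wide     = shufflevector <4 x i32> %madd, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %acc.next = add <8 x i32> %wide, %acc

SelectionDAG can then match the even/odd add of a sign-extended
multiply to VPMADDWD, and the zero padding folds into the final
reduction tree.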

Differential Revision: https://reviews.llvm.org/D76649

Added: 
    llvm/lib/Target/X86/X86PartialReduction.cpp

Modified: 
    llvm/include/llvm/CodeGen/SelectionDAGNodes.h
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
    llvm/lib/Target/X86/CMakeLists.txt
    llvm/lib/Target/X86/X86.h
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/lib/Target/X86/X86TargetMachine.cpp
    llvm/test/CodeGen/X86/O3-pipeline.ll
    llvm/test/CodeGen/X86/madd.ll
    llvm/test/CodeGen/X86/min-legal-vector-width.ll
    llvm/test/CodeGen/X86/sad.ll

Removed: 
    llvm/test/CodeGen/Generic/vector-redux.ll


################################################################################
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index faee8e8ffa17..17b66363bb07 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -368,7 +368,6 @@ struct SDNodeFlags {
   bool NoInfs : 1;
   bool NoSignedZeros : 1;
   bool AllowReciprocal : 1;
-  bool VectorReduction : 1;
   bool AllowContract : 1;
   bool ApproximateFuncs : 1;
   bool AllowReassociation : 1;
@@ -385,7 +384,7 @@ struct SDNodeFlags {
   SDNodeFlags()
       : AnyDefined(false), NoUnsignedWrap(false), NoSignedWrap(false),
         Exact(false), NoNaNs(false), NoInfs(false),
-        NoSignedZeros(false), AllowReciprocal(false), VectorReduction(false),
+        NoSignedZeros(false), AllowReciprocal(false),
         AllowContract(false), ApproximateFuncs(false),
         AllowReassociation(false), NoFPExcept(false) {}
 
@@ -434,10 +433,6 @@ struct SDNodeFlags {
     setDefined();
     AllowReciprocal = b;
   }
-  void setVectorReduction(bool b) {
-    setDefined();
-    VectorReduction = b;
-  }
   void setAllowContract(bool b) {
     setDefined();
     AllowContract = b;
@@ -463,7 +458,6 @@ struct SDNodeFlags {
   bool hasNoInfs() const { return NoInfs; }
   bool hasNoSignedZeros() const { return NoSignedZeros; }
   bool hasAllowReciprocal() const { return AllowReciprocal; }
-  bool hasVectorReduction() const { return VectorReduction; }
   bool hasAllowContract() const { return AllowContract; }
   bool hasApproximateFuncs() const { return ApproximateFuncs; }
   bool hasAllowReassociation() const { return AllowReassociation; }
@@ -481,7 +475,6 @@ struct SDNodeFlags {
     NoInfs &= Flags.NoInfs;
     NoSignedZeros &= Flags.NoSignedZeros;
     AllowReciprocal &= Flags.AllowReciprocal;
-    VectorReduction &= Flags.VectorReduction;
     AllowContract &= Flags.AllowContract;
     ApproximateFuncs &= Flags.ApproximateFuncs;
     AllowReassociation &= Flags.AllowReassociation;

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 574a80fcc346..0763a5e11420 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -967,10 +967,6 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
   if (N0.getOpcode() != Opc)
     return SDValue();
 
-  // Don't reassociate reductions.
-  if (N0->getFlags().hasVectorReduction())
-    return SDValue();
-
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
     if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
       // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
@@ -995,9 +991,6 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
 SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                                     SDValue N1, SDNodeFlags Flags) {
   assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
-  // Don't reassociate reductions.
-  if (Flags.hasVectorReduction())
-    return SDValue();
 
   // Floating-point reassociation is not allowed without loose FP math.
   if (N0.getValueType().isFloatingPoint() ||

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 1ad86208ed4e..d472ef948975 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -2998,133 +2998,6 @@ void SelectionDAGBuilder::visitFSub(const User &I) {
   visitBinary(I, ISD::FSUB);
 }
 
-/// Checks if the given instruction performs a vector reduction, in which case
-/// we have the freedom to alter the elements in the result as long as the
-/// reduction of them stays unchanged.
-static bool isVectorReductionOp(const User *I) {
-  const Instruction *Inst = dyn_cast<Instruction>(I);
-  if (!Inst || !Inst->getType()->isVectorTy())
-    return false;
-
-  auto OpCode = Inst->getOpcode();
-  switch (OpCode) {
-  case Instruction::Add:
-  case Instruction::Mul:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor:
-    break;
-  case Instruction::FAdd:
-  case Instruction::FMul:
-    if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
-      if (FPOp->getFastMathFlags().isFast())
-        break;
-    LLVM_FALLTHROUGH;
-  default:
-    return false;
-  }
-
-  unsigned ElemNum = Inst->getType()->getVectorNumElements();
-  // Ensure the reduction size is a power of 2.
-  if (!isPowerOf2_32(ElemNum))
-    return false;
-
-  unsigned ElemNumToReduce = ElemNum;
-
-  // Do DFS search on the def-use chain from the given instruction. We only
-  // allow four kinds of operations during the search until we reach the
-  // instruction that extracts the first element from the vector:
-  //
-  //   1. The reduction operation of the same opcode as the given instruction.
-  //
-  //   2. PHI node.
-  //
-  //   3. ShuffleVector instruction together with a reduction operation that
-  //      does a partial reduction.
-  //
-  //   4. ExtractElement that extracts the first element from the vector, and we
-  //      stop searching the def-use chain here.
-  //
-  // 3 & 4 above perform a reduction on all elements of the vector. We push defs
-  // from 1-3 to the stack to continue the DFS. The given instruction is not
-  // a reduction operation if we meet any other instructions other than those
-  // listed above.
-
-  SmallVector<const User *, 16> UsersToVisit{Inst};
-  SmallPtrSet<const User *, 16> Visited;
-  bool ReduxExtracted = false;
-
-  while (!UsersToVisit.empty()) {
-    auto User = UsersToVisit.back();
-    UsersToVisit.pop_back();
-    if (!Visited.insert(User).second)
-      continue;
-
-    for (const auto *U : User->users()) {
-      auto Inst = dyn_cast<Instruction>(U);
-      if (!Inst)
-        return false;
-
-      if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
-        if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
-          if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
-            return false;
-        UsersToVisit.push_back(U);
-      } else if (const ShuffleVectorInst *ShufInst =
-                     dyn_cast<ShuffleVectorInst>(U)) {
-        // Detect the following pattern: A ShuffleVector instruction together
-        // with a reduction that do partial reduction on the first and second
-        // ElemNumToReduce / 2 elements, and store the result in
-        // ElemNumToReduce / 2 elements in another vector.
-
-        unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
-        if (ResultElements < ElemNum)
-          return false;
-
-        if (ElemNumToReduce == 1)
-          return false;
-        if (!isa<UndefValue>(U->getOperand(1)))
-          return false;
-        for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
-          if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
-            return false;
-        for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
-          if (ShufInst->getMaskValue(i) != -1)
-            return false;
-
-        // There is only one user of this ShuffleVector instruction, which
-        // must be a reduction operation.
-        if (!U->hasOneUse())
-          return false;
-
-        auto U2 = dyn_cast<Instruction>(*U->user_begin());
-        if (!U2 || U2->getOpcode() != OpCode)
-          return false;
-
-        // Check operands of the reduction operation.
-        if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
-            (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
-          UsersToVisit.push_back(U2);
-          ElemNumToReduce /= 2;
-        } else
-          return false;
-      } else if (isa<ExtractElementInst>(U)) {
-        // At this moment we should have reduced all elements in the vector.
-        if (ElemNumToReduce != 1)
-          return false;
-
-        const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
-        if (!Val || !Val->isZero())
-          return false;
-
-        ReduxExtracted = true;
-      } else
-        return false;
-    }
-  }
-  return ReduxExtracted;
-}
-
 void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) {
   SDNodeFlags Flags;
 
@@ -3143,17 +3016,6 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
   if (auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) {
     Flags.setExact(ExactOp->isExact());
   }
-  if (isVectorReductionOp(&I)) {
-    Flags.setVectorReduction(true);
-    LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
-
-    // If no flags are set we will propagate the incoming flags, if any flags
-    // are set, we will intersect them with the incoming flag and so we need to
-    // copy the FMF flags here.
-    if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) {
-      Flags.copyFMF(*FPOp);
-    }
-  }
 
   SDValue Op1 = getValue(I.getOperand(0));
   SDValue Op2 = getValue(I.getOperand(1));

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index aca462f56674..f81d18c6ad52 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -553,9 +553,6 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
   if (getFlags().hasAllowReassociation())
     OS << " reassoc";
 
-  if (getFlags().hasVectorReduction())
-    OS << " vector-reduction";
-
   if (getFlags().hasNoFPExcept())
     OS << " nofpexcept";
 

diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
index 3f0d68c0c788..1542fc1514ea 100644
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -56,6 +56,7 @@ set(sources
   X86MacroFusion.cpp
   X86OptimizeLEAs.cpp
   X86PadShortFunction.cpp
+  X86PartialReduction.cpp
   X86RegisterBankInfo.cpp
   X86RegisterInfo.cpp
   X86RetpolineThunks.cpp

diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index c81b349ecedd..8c0a13ccdc2c 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -133,6 +133,11 @@ FunctionPass *createX86InsertPrefetchPass();
 /// fp exceptions when strict-fp enabled.
 FunctionPass *createX86InsertX87waitPass();
 
+/// This pass optimizes arithmetic based on knowledge that it is only used by
+/// a reduction sequence and is therefore safe to reassociate in interesting
+/// ways.
+FunctionPass *createX86PartialReductionPass();
+
 InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
                                                   X86Subtarget &,
                                                   X86RegisterBankInfo &);
@@ -154,6 +159,7 @@ void initializeX86ExecutionDomainFixPass(PassRegistry &);
 void initializeX86ExpandPseudoPass(PassRegistry &);
 void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
 void initializeX86OptimizeLEAPassPass(PassRegistry &);
+void initializeX86PartialReductionPass(PassRegistry &);
 void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
 
 namespace X86AS {

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e40055fe37b5..f91d7ff9dc49 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45902,131 +45902,6 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
                      DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
 }
 
-static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
-                                      const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasSSE2())
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-
-  // If the vector size is less than 128, or greater than the supported RegSize,
-  // do not use PMADD.
-  if (!VT.isVector() || VT.getVectorNumElements() < 8)
-    return SDValue();
-
-  SDValue Op0 = N->getOperand(0);
-  SDValue Op1 = N->getOperand(1);
-
-  auto UsePMADDWD = [&](SDValue Op) {
-    ShrinkMode Mode;
-    return Op.getOpcode() == ISD::MUL &&
-           canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
-           Mode != ShrinkMode::MULU16 &&
-           (!Subtarget.hasSSE41() ||
-            (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
-             Op->isOnlyUserOf(Op.getOperand(1).getNode())));
-  };
-
-  SDValue MulOp, OtherOp;
-  if (UsePMADDWD(Op0)) {
-    MulOp = Op0;
-    OtherOp = Op1;
-  } else if (UsePMADDWD(Op1)) {
-    MulOp = Op1;
-    OtherOp = Op0;
-  } else
-   return SDValue();
-
-  SDLoc DL(N);
-  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
-                                   VT.getVectorNumElements());
-  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
-                                VT.getVectorNumElements() / 2);
-
-  // Shrink the operands of mul.
-  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
-  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
-
-  // Madd vector size is half of the original vector size
-  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
-                           ArrayRef<SDValue> Ops) {
-    MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
-    return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
-  };
-  SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
-                                  PMADDWDBuilder);
-  // Fill the rest of the output with 0
-  SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
-  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
-
-  // Preserve the reduction flag on the ADD. We may need to revisit for the
-  // other operand.
-  SDNodeFlags Flags;
-  Flags.setVectorReduction(true);
-  return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
-}
-
-static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
-                                     const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasSSE2())
-    return SDValue();
-
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-
-  // TODO: There's nothing special about i32, any integer type above i16 should
-  // work just as well.
-  if (!VT.isVector() || !isPowerOf2_32(VT.getVectorNumElements()) ||
-      VT.getVectorElementType() != MVT::i32)
-    return SDValue();
-
-  // We know N is a reduction add. To match SAD, we need one of the operands to
-  // be an ABS.
-  SDValue AbsOp = N->getOperand(0);
-  SDValue OtherOp = N->getOperand(1);
-  if (AbsOp.getOpcode() != ISD::ABS)
-    std::swap(AbsOp, OtherOp);
-  if (AbsOp.getOpcode() != ISD::ABS)
-    return SDValue();
-
-  // Check whether we have an abs-diff pattern feeding into the select.
-  SDValue SadOp0, SadOp1;
-  if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
-    return SDValue();
-
-  // SAD pattern detected. Now build a SAD instruction and an addition for
-  // reduction. Note that the number of elements of the result of SAD is less
-  // than the number of elements of its input. Therefore, we could only update
-  // part of elements in the reduction vector.
-  SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
-
-  // The output of PSADBW is a vector of i64.
-  // We need to turn the vector of i64 into a vector of i32.
-  // If the reduction vector is at least as wide as the psadbw result, just
-  // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of
-  // the PSADBW will be zero.
-  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
-  Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
-
-  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
-    // Fill the upper elements with zero to match the add width.
-    assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
-    unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
-    SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
-    Ops[0] = Sad;
-    Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
-  } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
-    Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
-                      DAG.getIntPtrConstant(0, DL));
-  }
-
-  // Preserve the reduction flag on the ADD. We may need to revisit for the
-  // other operand.
-  SDNodeFlags Flags;
-  Flags.setVectorReduction(true);
-  return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
-}
-
 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
                             const SDLoc &DL, EVT VT,
                             const X86Subtarget &Subtarget) {
@@ -46116,30 +45991,25 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
       Mode == ShrinkMode::MULU16)
     return SDValue();
 
+  EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+                                 VT.getVectorNumElements() * 2);
+  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
+  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+
   auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
                          ArrayRef<SDValue> Ops) {
-    // Shrink by adding truncate nodes and let DAGCombine fold with the
-    // sources.
     EVT InVT = Ops[0].getValueType();
-    assert(InVT.getScalarType() == MVT::i32 &&
-           "Unexpected scalar element type");
     assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
     EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                  InVT.getVectorNumElements() / 2);
-    EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
-                                   InVT.getVectorNumElements());
-    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
-                       DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
-                       DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
+    return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
   };
-  return SplitOpsAndApply(DAG, Subtarget, DL, VT,
-                          { Mul.getOperand(0), Mul.getOperand(1) },
-                          PMADDBuilder);
+  return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
 }
 
 // Attempt to turn this pattern into PMADDWD.
-// (mul (add (sext (build_vector)), (sext (build_vector))),
-//      (add (sext (build_vector)), (sext (build_vector)))
+// (add (mul (sext (build_vector)), (sext (build_vector))),
+//      (mul (sext (build_vector)), (sext (build_vector)))
 static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
                               const SDLoc &DL, EVT VT,
                               const X86Subtarget &Subtarget) {
@@ -46261,13 +46131,6 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
-  const SDNodeFlags Flags = N->getFlags();
-  if (Flags.hasVectorReduction()) {
-    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
-      return Sad;
-    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
-      return MAdd;
-  }
   EVT VT = N->getValueType(0);
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);

diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
new file mode 100644
index 000000000000..4cd231de3440
--- /dev/null
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -0,0 +1,460 @@
+//===-- X86PartialReduction.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for add instructions used by a horizontal reduction to see
+// if we might be able to use pmaddwd or psadbw. Some cases of this require
+// cross basic block knowledge and can't be done in SelectionDAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-partial-reduction"
+
+namespace {
+
+class X86PartialReduction : public FunctionPass {
+  const DataLayout *DL;
+  const X86Subtarget *ST;
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+
+  X86PartialReduction() : FunctionPass(ID) { }
+
+  bool runOnFunction(Function &Fn) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+  }
+
+  StringRef getPassName() const override {
+    return "X86 Partial Reduction";
+  }
+
+private:
+  bool tryMAddPattern(BinaryOperator *BO);
+  bool tryMAddReplacement(Value *Op, BinaryOperator *Add);
+
+  bool trySADPattern(BinaryOperator *BO);
+  bool trySADReplacement(Value *Op, BinaryOperator *Add);
+};
+}
+
+FunctionPass *llvm::createX86PartialReductionPass() {
+  return new X86PartialReduction();
+}
+
+char X86PartialReduction::ID = 0;
+
+INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
+                "X86 Partial Reduction", false, false)
+
+static bool isVectorReductionOp(const BinaryOperator &BO) {
+  if (!BO.getType()->isVectorTy())
+    return false;
+
+  unsigned Opcode = BO.getOpcode();
+
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    break;
+  case Instruction::FAdd:
+  case Instruction::FMul:
+    if (auto *FPOp = dyn_cast<FPMathOperator>(&BO))
+      if (FPOp->getFastMathFlags().isFast())
+        break;
+    LLVM_FALLTHROUGH;
+  default:
+    return false;
+  }
+
+  unsigned ElemNum = BO.getType()->getVectorNumElements();
+  // Ensure the reduction size is a power of 2.
+  if (!isPowerOf2_32(ElemNum))
+    return false;
+
+  unsigned ElemNumToReduce = ElemNum;
+
+  // Do DFS search on the def-use chain from the given instruction. We only
+  // allow four kinds of operations during the search until we reach the
+  // instruction that extracts the first element from the vector:
+  //
+  //   1. The reduction operation of the same opcode as the given instruction.
+  //
+  //   2. PHI node.
+  //
+  //   3. ShuffleVector instruction together with a reduction operation that
+  //      does a partial reduction.
+  //
+  //   4. ExtractElement that extracts the first element from the vector, and we
+  //      stop searching the def-use chain here.
+  //
+  // 3 & 4 above perform a reduction on all elements of the vector. We push defs
+  // from 1-3 to the stack to continue the DFS. The given instruction is not
+  // a reduction operation if we meet any other instructions other than those
+  // listed above.
+
+  SmallVector<const User *, 16> UsersToVisit{&BO};
+  SmallPtrSet<const User *, 16> Visited;
+  bool ReduxExtracted = false;
+
+  while (!UsersToVisit.empty()) {
+    auto User = UsersToVisit.back();
+    UsersToVisit.pop_back();
+    if (!Visited.insert(User).second)
+      continue;
+
+    for (const auto *U : User->users()) {
+      auto *Inst = dyn_cast<Instruction>(U);
+      if (!Inst)
+        return false;
+
+      if (Inst->getOpcode() == Opcode || isa<PHINode>(U)) {
+        if (auto *FPOp = dyn_cast<FPMathOperator>(Inst))
+          if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
+            return false;
+        UsersToVisit.push_back(U);
+      } else if (auto *ShufInst = dyn_cast<ShuffleVectorInst>(U)) {
+        // Detect the following pattern: A ShuffleVector instruction together
+        // with a reduction that do partial reduction on the first and second
+        // ElemNumToReduce / 2 elements, and store the result in
+        // ElemNumToReduce / 2 elements in another vector.
+
+        unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
+        if (ResultElements < ElemNum)
+          return false;
+
+        if (ElemNumToReduce == 1)
+          return false;
+        if (!isa<UndefValue>(U->getOperand(1)))
+          return false;
+        for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
+          if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
+            return false;
+        for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
+          if (ShufInst->getMaskValue(i) != -1)
+            return false;
+
+        // There is only one user of this ShuffleVector instruction, which
+        // must be a reduction operation.
+        if (!U->hasOneUse())
+          return false;
+
+        auto *U2 = dyn_cast<BinaryOperator>(*U->user_begin());
+        if (!U2 || U2->getOpcode() != Opcode)
+          return false;
+
+        // Check operands of the reduction operation.
+        if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
+            (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
+          UsersToVisit.push_back(U2);
+          ElemNumToReduce /= 2;
+        } else
+          return false;
+      } else if (isa<ExtractElementInst>(U)) {
+        // At this moment we should have reduced all elements in the vector.
+        if (ElemNumToReduce != 1)
+          return false;
+
+        auto *Val = dyn_cast<ConstantInt>(U->getOperand(1));
+        if (!Val || !Val->isZero())
+          return false;
+
+        ReduxExtracted = true;
+      } else
+        return false;
+    }
+  }
+  return ReduxExtracted;
+}
+
+bool X86PartialReduction::tryMAddReplacement(Value *Op, BinaryOperator *Add) {
+  BasicBlock *BB = Add->getParent();
+
+  auto *BO = dyn_cast<BinaryOperator>(Op);
+  if (!BO || BO->getOpcode() != Instruction::Mul || !BO->hasOneUse() ||
+      BO->getParent() != BB)
+    return false;
+
+  Value *LHS = BO->getOperand(0);
+  Value *RHS = BO->getOperand(1);
+
+  // LHS and RHS should be only used once or if they are the same then only
+  // used twice. Only check this when SSE4.1 is enabled and we have zext/sext
+  // instructions, otherwise we use punpck to emulate zero extend in stages. The
+  // trunc we need to do likely won't introduce new instructions in that case.
+  if (ST->hasSSE41()) {
+    if (LHS == RHS) {
+      if (!isa<Constant>(LHS) && !LHS->hasNUses(2))
+        return false;
+    } else {
+      if (!isa<Constant>(LHS) && !LHS->hasOneUse())
+        return false;
+      if (!isa<Constant>(RHS) && !RHS->hasOneUse())
+        return false;
+    }
+  }
+
+  auto canShrinkOp = [&](Value *Op) {
+    if (isa<Constant>(Op) && ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+      return true;
+    if (auto *Cast = dyn_cast<CastInst>(Op)) {
+      if (Cast->getParent() == BB &&
+          (Cast->getOpcode() == Instruction::SExt ||
+           Cast->getOpcode() == Instruction::ZExt) &&
+          ComputeNumSignBits(Op, *DL, 0, nullptr, BO) > 16)
+        return true;
+    }
+
+    return false;
+  };
+
+  // Both Ops need to be shrinkable.
+  if (!canShrinkOp(LHS) && !canShrinkOp(RHS))
+    return false;
+
+  IRBuilder<> Builder(Add);
+
+  Type *MulTy = Op->getType();
+  unsigned NumElts = MulTy->getVectorNumElements();
+
+  // Extract even elements and odd elements and add them together. This will
+  // be pattern matched by SelectionDAG to pmaddwd. This instruction will be
+  // half the original width.
+  SmallVector<uint32_t, 16> EvenMask(NumElts / 2);
+  SmallVector<uint32_t, 16> OddMask(NumElts / 2);
+  for (int i = 0, e = NumElts / 2; i != e; ++i) {
+    EvenMask[i] = i * 2;
+    OddMask[i] = i * 2 + 1;
+  }
+  Value *EvenElts = Builder.CreateShuffleVector(BO, BO, EvenMask);
+  Value *OddElts = Builder.CreateShuffleVector(BO, BO, OddMask);
+  Value *MAdd = Builder.CreateAdd(EvenElts, OddElts);
+
+  // Concatenate zeroes to extend back to the original type.
+  SmallVector<uint32_t, 32> ConcatMask(NumElts);
+  std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+  Value *Zero = Constant::getNullValue(MAdd->getType());
+  Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask);
+
+  // Replaces the use of mul in the original Add with the pmaddwd and zeroes.
+  Add->replaceUsesOfWith(BO, Concat);
+  Add->setHasNoSignedWrap(false);
+  Add->setHasNoUnsignedWrap(false);
+
+  return true;
+}
+
+// Try to replace operands of this add with pmaddwd patterns.
+bool X86PartialReduction::tryMAddPattern(BinaryOperator *BO) {
+  if (!ST->hasSSE2())
+    return false;
+
+  // Need at least 8 elements.
+  if (BO->getType()->getVectorNumElements() < 8)
+    return false;
+
+  // Element type should be i32.
+  if (!BO->getType()->getVectorElementType()->isIntegerTy(32))
+    return false;
+
+  bool Changed = false;
+  Changed |= tryMAddReplacement(BO->getOperand(0), BO);
+  Changed |= tryMAddReplacement(BO->getOperand(1), BO);
+  return Changed;
+}
+
+bool X86PartialReduction::trySADReplacement(Value *Op, BinaryOperator *Add) {
+  // Operand should be a select.
+  auto *SI = dyn_cast<SelectInst>(Op);
+  if (!SI)
+    return false;
+
+  // Select needs to implement absolute value.
+  Value *LHS, *RHS;
+  auto SPR = matchSelectPattern(SI, LHS, RHS);
+  if (SPR.Flavor != SPF_ABS)
+    return false;
+
+  // Need a subtract of two values.
+  auto *Sub = dyn_cast<BinaryOperator>(LHS);
+  if (!Sub || Sub->getOpcode() != Instruction::Sub)
+    return false;
+
+  // Look for zero extend from i8.
+  auto getZeroExtendedVal = [](Value *Op) -> Value * {
+    if (auto *ZExt = dyn_cast<ZExtInst>(Op))
+      if (ZExt->getOperand(0)->getType()->getVectorElementType()->isIntegerTy(8))
+        return ZExt->getOperand(0);
+
+    return nullptr;
+  };
+
+  // Both operands of the subtract should be extends from vXi8.
+  Value *Op0 = getZeroExtendedVal(Sub->getOperand(0));
+  Value *Op1 = getZeroExtendedVal(Sub->getOperand(1));
+  if (!Op0 || !Op1)
+    return false;
+
+  IRBuilder<> Builder(Add);
+
+  Type *OpTy = Op->getType();
+  unsigned NumElts = OpTy->getVectorNumElements();
+
+  unsigned IntrinsicNumElts;
+  Intrinsic::ID IID;
+  if (ST->hasBWI() && NumElts >= 64) {
+    IID = Intrinsic::x86_avx512_psad_bw_512;
+    IntrinsicNumElts = 64;
+  } else if (ST->hasAVX2() && NumElts >= 32) {
+    IID = Intrinsic::x86_avx2_psad_bw;
+    IntrinsicNumElts = 32;
+  } else {
+    IID = Intrinsic::x86_sse2_psad_bw;
+    IntrinsicNumElts = 16;
+  }
+
+  Function *PSADBWFn = Intrinsic::getDeclaration(Add->getModule(), IID);
+
+  if (NumElts < 16) {
+    // Pad input with zeroes.
+    SmallVector<uint32_t, 32> ConcatMask(16);
+    for (unsigned i = 0; i != NumElts; ++i)
+      ConcatMask[i] = i;
+    for (unsigned i = NumElts; i != 16; ++i)
+      ConcatMask[i] = (i % NumElts) + NumElts;
+
+    Value *Zero = Constant::getNullValue(Op0->getType());
+    Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask);
+    Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask);
+    NumElts = 16;
+  }
+
+  // Intrinsics produce vXi64 and need to be casted to vXi32.
+  Type *I32Ty = VectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4);
+
+  assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!");
+  unsigned NumSplits = NumElts / IntrinsicNumElts;
+
+  // First collect the pieces we need.
+  SmallVector<Value *, 4> Ops(NumSplits);
+  for (unsigned i = 0; i != NumSplits; ++i) {
+    SmallVector<uint32_t, 64> ExtractMask(IntrinsicNumElts);
+    std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts);
+    Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask);
+    Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op0, ExtractMask);
+    Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1});
+    Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty);
+  }
+
+  assert(isPowerOf2_32(NumSplits) && "Expected power of 2 splits");
+  unsigned Stages = Log2_32(NumSplits);
+  for (unsigned s = Stages; s > 0; --s) {
+    unsigned NumConcatElts = Ops[0]->getType()->getVectorNumElements() * 2;
+    for (unsigned i = 0; i != 1 << (s - 1); ++i) {
+      SmallVector<uint32_t, 64> ConcatMask(NumConcatElts);
+      std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+      Ops[i] = Builder.CreateShuffleVector(Ops[i*2], Ops[i*2+1], ConcatMask);
+    }
+  }
+
+  // At this point the final value should be in Ops[0]. Now we need to adjust
+  // it to the final original type.
+  NumElts = OpTy->getVectorNumElements();
+  if (NumElts == 2) {
+    // Extract down to 2 elements.
+    Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], {0, 1});
+  } else if (NumElts >= 8) {
+    SmallVector<uint32_t, 32> ConcatMask(NumElts);
+    unsigned SubElts = Ops[0]->getType()->getVectorNumElements();
+    for (unsigned i = 0; i != SubElts; ++i)
+      ConcatMask[i] = i;
+    for (unsigned i = SubElts; i != NumElts; ++i)
+      ConcatMask[i] = (i % SubElts) + SubElts;
+
+    Value *Zero = Constant::getNullValue(Ops[0]->getType());
+    Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
+  }
+
+  // Replaces the uses of Op in Add with the new sequence.
+  Add->replaceUsesOfWith(Op, Ops[0]);
+  Add->setHasNoSignedWrap(false);
+  Add->setHasNoUnsignedWrap(false);
+
+  return true;
+}
+
+bool X86PartialReduction::trySADPattern(BinaryOperator *BO) {
+  if (!ST->hasSSE2())
+    return false;
+
+  // TODO: There's nothing special about i32, any integer type above i16 should
+  // work just as well.
+  if (!BO->getType()->getVectorElementType()->isIntegerTy(32))
+    return false;
+
+  bool Changed = false;
+  Changed |= trySADReplacement(BO->getOperand(0), BO);
+  Changed |= trySADReplacement(BO->getOperand(1), BO);
+  return Changed;
+}
+
+bool X86PartialReduction::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  if (!TPC)
+    return false;
+
+  auto &TM = TPC->getTM<X86TargetMachine>();
+  ST = TM.getSubtargetImpl(F);
+
+  DL = &F.getParent()->getDataLayout();
+
+  bool MadeChange = false;
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      auto *BO = dyn_cast<BinaryOperator>(&I);
+      if (!BO)
+        continue;
+
+      if (!isVectorReductionOp(*BO))
+        continue;
+
+      if (BO->getOpcode() == Instruction::Add) {
+        if (tryMAddPattern(BO)) {
+          MadeChange = true;
+          continue;
+        }
+        if (trySADPattern(BO)) {
+          MadeChange = true;
+          continue;
+        }
+      }
+    }
+  }
+
+  return MadeChange;
+}
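
As a rough IR-level illustration of the trySADReplacement path above
(invented value names; assumes a <16 x i32> reduction, for which the
128-bit SSE2 psadbw intrinsic is chosen, where %a and %b are the
<16 x i8> inputs of the zext/sub/abs-select pattern the pass matched):

    ; signature of the intrinsic used below
    declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>)

    ; the abs(sub(zext %a, zext %b)) operand of the reduction add is replaced
    ; by a psadbw whose i64 lanes are recast to i32 and then zero-padded
    %sad      = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a, <16 x i8> %b)
    %sad32    = bitcast <2 x i64> %sad to <4 x i32>
    %wide     = shufflevector <4 x i32> %sad32, <4 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
    %acc.next = add <16 x i32> %wide, %acc

Wider inputs pick the AVX2 or AVX-512BW psadbw intrinsic instead and
are split and re-concatenated with the shuffle masks built in the
loops above.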

diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index dd6b67865ac0..d80c82c584bd 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -84,6 +84,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
   initializeX86FlagsCopyLoweringPassPass(PR);
   initializeX86CondBrFoldingPassPass(PR);
   initializeX86OptimizeLEAPassPass(PR);
+  initializeX86PartialReductionPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -400,8 +401,10 @@ void X86PassConfig::addIRPasses() {
 
   TargetPassConfig::addIRPasses();
 
-  if (TM->getOptLevel() != CodeGenOpt::None)
+  if (TM->getOptLevel() != CodeGenOpt::None) {
     addPass(createInterleavedAccessPass());
+    addPass(createX86PartialReductionPass());
+  }
 
   // Add passes that handle indirect branch removal and insertion of a retpoline
   // thunk. These will be a no-op unless a function subtarget has the retpoline

diff --git a/llvm/test/CodeGen/Generic/vector-redux.ll b/llvm/test/CodeGen/Generic/vector-redux.ll
deleted file mode 100644
index 8efdbf85b8c0..000000000000
--- a/llvm/test/CodeGen/Generic/vector-redux.ll
+++ /dev/null
@@ -1,237 +0,0 @@
-; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s
-; REQUIRES: asserts
-
- at a = global [1024 x i32] zeroinitializer, align 16
-
-define i32 @reduce_add() {
-; CHECK-LABEL: reduce_add
-; CHECK:       Detected a reduction operation: {{.*}} add
-; CHECK:       Detected a reduction operation: {{.*}} add
-; CHECK:       Detected a reduction operation: {{.*}} add
-; CHECK:       Detected a reduction operation: {{.*}} add
-; CHECK:       Detected a reduction operation: {{.*}} add
-; CHECK:       Detected a reduction operation: {{.*}} add
-; CHECK:       Detected a reduction operation: {{.*}} add
-; CHECK:       Detected a reduction operation: {{.*}} add
-; CHECK:       Detected a reduction operation: {{.*}} add
-; CHECK:       Detected a reduction operation: {{.*}} add
-; CHECK:       Detected a reduction operation: {{.*}} add
-
-min.iters.checked:
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %min.iters.checked ], [ %index.next.4, %vector.body ]
-  %vec.phi = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %28, %vector.body ]
-  %vec.phi4 = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %29, %vector.body ]
-  %0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index
-  %1 = bitcast i32* %0 to <4 x i32>*
-  %wide.load = load <4 x i32>, <4 x i32>* %1, align 16
-  %2 = getelementptr i32, i32* %0, i64 4
-  %3 = bitcast i32* %2 to <4 x i32>*
-  %wide.load5 = load <4 x i32>, <4 x i32>* %3, align 16
-  %4 = add nsw <4 x i32> %wide.load, %vec.phi
-  %5 = add nsw <4 x i32> %wide.load5, %vec.phi4
-  %index.next = add nuw nsw i64 %index, 8
-  %6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next
-  %7 = bitcast i32* %6 to <4 x i32>*
-  %wide.load.1 = load <4 x i32>, <4 x i32>* %7, align 16
-  %8 = getelementptr i32, i32* %6, i64 4
-  %9 = bitcast i32* %8 to <4 x i32>*
-  %wide.load5.1 = load <4 x i32>, <4 x i32>* %9, align 16
-  %10 = add nsw <4 x i32> %wide.load.1, %4
-  %11 = add nsw <4 x i32> %wide.load5.1, %5
-  %index.next.1 = add nsw i64 %index, 16
-  %12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.1
-  %13 = bitcast i32* %12 to <4 x i32>*
-  %wide.load.2 = load <4 x i32>, <4 x i32>* %13, align 16
-  %14 = getelementptr i32, i32* %12, i64 4
-  %15 = bitcast i32* %14 to <4 x i32>*
-  %wide.load5.2 = load <4 x i32>, <4 x i32>* %15, align 16
-  %16 = add nsw <4 x i32> %wide.load.2, %10
-  %17 = add nsw <4 x i32> %wide.load5.2, %11
-  %index.next.2 = add nsw i64 %index, 24
-  %18 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.2
-  %19 = bitcast i32* %18 to <4 x i32>*
-  %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 16
-  %20 = getelementptr i32, i32* %18, i64 4
-  %21 = bitcast i32* %20 to <4 x i32>*
-  %wide.load5.3 = load <4 x i32>, <4 x i32>* %21, align 16
-  %22 = add nsw <4 x i32> %wide.load.3, %16
-  %23 = add nsw <4 x i32> %wide.load5.3, %17
-  %index.next.3 = add nsw i64 %index, 32
-  %24 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.3
-  %25 = bitcast i32* %24 to <4 x i32>*
-  %wide.load.4 = load <4 x i32>, <4 x i32>* %25, align 16
-  %26 = getelementptr i32, i32* %24, i64 4
-  %27 = bitcast i32* %26 to <4 x i32>*
-  %wide.load5.4 = load <4 x i32>, <4 x i32>* %27, align 16
-  %28 = add nsw <4 x i32> %wide.load.4, %22
-  %29 = add nsw <4 x i32> %wide.load5.4, %23
-  %index.next.4 = add nsw i64 %index, 40
-  %30 = icmp eq i64 %index.next.4, 1000
-  br i1 %30, label %middle.block, label %vector.body
-
-middle.block:
-  %.lcssa10 = phi <4 x i32> [ %29, %vector.body ]
-  %.lcssa = phi <4 x i32> [ %28, %vector.body ]
-  %bin.rdx = add <4 x i32> %.lcssa10, %.lcssa
-  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-  %bin.rdx6 = add <4 x i32> %bin.rdx, %rdx.shuf
-  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx6, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-  %bin.rdx8 = add <4 x i32> %bin.rdx6, %rdx.shuf7
-  %31 = extractelement <4 x i32> %bin.rdx8, i32 0
-  ret i32 %31
-}
-
-define i32 @reduce_and() {
-; CHECK-LABEL: reduce_and
-; CHECK:       Detected a reduction operation: {{.*}} and
-; CHECK:       Detected a reduction operation: {{.*}} and
-; CHECK:       Detected a reduction operation: {{.*}} and
-; CHECK:       Detected a reduction operation: {{.*}} and
-; CHECK:       Detected a reduction operation: {{.*}} and
-; CHECK:       Detected a reduction operation: {{.*}} and
-; CHECK:       Detected a reduction operation: {{.*}} and
-; CHECK:       Detected a reduction operation: {{.*}} and
-; CHECK:       Detected a reduction operation: {{.*}} and
-
-entry:
-  br label %vector.body
-
-vector.body:
-  %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ -4096, %entry ]
-  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %6, %vector.body ]
-  %vec.phi9 = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %7, %vector.body ]
-  %uglygep33 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
-  %uglygep3334 = bitcast i8* %uglygep33 to <4 x i32>*
-  %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 256
-  %wide.load = load <4 x i32>, <4 x i32>* %scevgep35, align 16
-  %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 257
-  %wide.load10 = load <4 x i32>, <4 x i32>* %scevgep36, align 16
-  %0 = and <4 x i32> %wide.load, %vec.phi
-  %1 = and <4 x i32> %wide.load10, %vec.phi9
-  %uglygep30 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
-  %uglygep3031 = bitcast i8* %uglygep30 to <4 x i32>*
-  %scevgep32 = getelementptr <4 x i32>, <4 x i32>* %uglygep3031, i64 258
-  %wide.load.1 = load <4 x i32>, <4 x i32>* %scevgep32, align 16
-  %uglygep27 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
-  %uglygep2728 = bitcast i8* %uglygep27 to <4 x i32>*
-  %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %uglygep2728, i64 259
-  %wide.load10.1 = load <4 x i32>, <4 x i32>* %scevgep29, align 16
-  %2 = and <4 x i32> %wide.load.1, %0
-  %3 = and <4 x i32> %wide.load10.1, %1
-  %uglygep24 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
-  %uglygep2425 = bitcast i8* %uglygep24 to <4 x i32>*
-  %scevgep26 = getelementptr <4 x i32>, <4 x i32>* %uglygep2425, i64 260
-  %wide.load.2 = load <4 x i32>, <4 x i32>* %scevgep26, align 16
-  %uglygep21 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
-  %uglygep2122 = bitcast i8* %uglygep21 to <4 x i32>*
-  %scevgep23 = getelementptr <4 x i32>, <4 x i32>* %uglygep2122, i64 261
-  %wide.load10.2 = load <4 x i32>, <4 x i32>* %scevgep23, align 16
-  %4 = and <4 x i32> %wide.load.2, %2
-  %5 = and <4 x i32> %wide.load10.2, %3
-  %uglygep18 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
-  %uglygep1819 = bitcast i8* %uglygep18 to <4 x i32>*
-  %scevgep20 = getelementptr <4 x i32>, <4 x i32>* %uglygep1819, i64 262
-  %wide.load.3 = load <4 x i32>, <4 x i32>* %scevgep20, align 16
-  %uglygep = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
-  %uglygep17 = bitcast i8* %uglygep to <4 x i32>*
-  %scevgep = getelementptr <4 x i32>, <4 x i32>* %uglygep17, i64 263
-  %wide.load10.3 = load <4 x i32>, <4 x i32>* %scevgep, align 16
-  %6 = and <4 x i32> %wide.load.3, %4
-  %7 = and <4 x i32> %wide.load10.3, %5
-  %lsr.iv.next = add nsw i64 %lsr.iv, 128
-  %8 = icmp eq i64 %lsr.iv.next, 0
-  br i1 %8, label %middle.block, label %vector.body
-
-middle.block:
-  %bin.rdx = and <4 x i32> %7, %6
-  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-  %bin.rdx11 = and <4 x i32> %bin.rdx, %rdx.shuf
-  %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-  %bin.rdx13 = and <4 x i32> %bin.rdx11, %rdx.shuf12
-  %9 = extractelement <4 x i32> %bin.rdx13, i32 0
-  ret i32 %9
-}
-
-define float @reduce_add_float(float* nocapture readonly %a) {
-; CHECK-LABEL: reduce_add_float
-; CHECK:       Detected a reduction operation: {{.*}} fadd fast
-; CHECK:       Detected a reduction operation: {{.*}} fadd fast
-; CHECK:       Detected a reduction operation: {{.*}} fadd fast
-; CHECK:       Detected a reduction operation: {{.*}} fadd fast
-; CHECK:       Detected a reduction operation: {{.*}} fadd fast
-; CHECK:       Detected a reduction operation: {{.*}} fadd fast
-; CHECK:       Detected a reduction operation: {{.*}} fadd fast
-; CHECK:       Detected a reduction operation: {{.*}} fadd fast
-; CHECK:       Detected a reduction operation: {{.*}} fadd fast
-; CHECK:       Detected a reduction operation: {{.*}} fadd fast
-; CHECK:       Detected a reduction operation: {{.*}} fadd fast
-;
-entry:
-  br label %vector.body
-
-vector.body:
-  %index = phi i64 [ 0, %entry ], [ %index.next.4, %vector.body ]
-  %vec.phi = phi <4 x float> [ zeroinitializer, %entry ], [ %28, %vector.body ]
-  %vec.phi9 = phi <4 x float> [ zeroinitializer, %entry ], [ %29, %vector.body ]
-  %0 = getelementptr inbounds float, float* %a, i64 %index
-  %1 = bitcast float* %0 to <4 x float>*
-  %wide.load = load <4 x float>, <4 x float>* %1, align 4
-  %2 = getelementptr float, float* %0, i64 4
-  %3 = bitcast float* %2 to <4 x float>*
-  %wide.load10 = load <4 x float>, <4 x float>* %3, align 4
-  %4 = fadd fast <4 x float> %wide.load, %vec.phi
-  %5 = fadd fast <4 x float> %wide.load10, %vec.phi9
-  %index.next = add nuw nsw i64 %index, 8
-  %6 = getelementptr inbounds float, float* %a, i64 %index.next
-  %7 = bitcast float* %6 to <4 x float>*
-  %wide.load.1 = load <4 x float>, <4 x float>* %7, align 4
-  %8 = getelementptr float, float* %6, i64 4
-  %9 = bitcast float* %8 to <4 x float>*
-  %wide.load10.1 = load <4 x float>, <4 x float>* %9, align 4
-  %10 = fadd fast <4 x float> %wide.load.1, %4
-  %11 = fadd fast <4 x float> %wide.load10.1, %5
-  %index.next.1 = add nsw i64 %index, 16
-  %12 = getelementptr inbounds float, float* %a, i64 %index.next.1
-  %13 = bitcast float* %12 to <4 x float>*
-  %wide.load.2 = load <4 x float>, <4 x float>* %13, align 4
-  %14 = getelementptr float, float* %12, i64 4
-  %15 = bitcast float* %14 to <4 x float>*
-  %wide.load10.2 = load <4 x float>, <4 x float>* %15, align 4
-  %16 = fadd fast <4 x float> %wide.load.2, %10
-  %17 = fadd fast <4 x float> %wide.load10.2, %11
-  %index.next.2 = add nsw i64 %index, 24
-  %18 = getelementptr inbounds float, float* %a, i64 %index.next.2
-  %19 = bitcast float* %18 to <4 x float>*
-  %wide.load.3 = load <4 x float>, <4 x float>* %19, align 4
-  %20 = getelementptr float, float* %18, i64 4
-  %21 = bitcast float* %20 to <4 x float>*
-  %wide.load10.3 = load <4 x float>, <4 x float>* %21, align 4
-  %22 = fadd fast <4 x float> %wide.load.3, %16
-  %23 = fadd fast <4 x float> %wide.load10.3, %17
-  %index.next.3 = add nsw i64 %index, 32
-  %24 = getelementptr inbounds float, float* %a, i64 %index.next.3
-  %25 = bitcast float* %24 to <4 x float>*
-  %wide.load.4 = load <4 x float>, <4 x float>* %25, align 4
-  %26 = getelementptr float, float* %24, i64 4
-  %27 = bitcast float* %26 to <4 x float>*
-  %wide.load10.4 = load <4 x float>, <4 x float>* %27, align 4
-  %28 = fadd fast <4 x float> %wide.load.4, %22
-  %29 = fadd fast <4 x float> %wide.load10.4, %23
-  %index.next.4 = add nsw i64 %index, 40
-  %30 = icmp eq i64 %index.next.4, 1000
-  br i1 %30, label %middle.block, label %vector.body
-
-middle.block:
-  %.lcssa15 = phi <4 x float> [ %29, %vector.body ]
-  %.lcssa = phi <4 x float> [ %28, %vector.body ]
-  %bin.rdx = fadd fast <4 x float> %.lcssa15, %.lcssa
-  %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-  %bin.rdx11 = fadd fast <4 x float> %bin.rdx, %rdx.shuf
-  %rdx.shuf12 = shufflevector <4 x float> %bin.rdx11, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-  %bin.rdx13 = fadd fast <4 x float> %bin.rdx11, %rdx.shuf12
-  %31 = extractelement <4 x float> %bin.rdx13, i32 0
-  ret float %31
-}

diff --git a/llvm/test/CodeGen/X86/O3-pipeline.ll b/llvm/test/CodeGen/X86/O3-pipeline.ll
index c72676b24cfd..f22c70199d19 100644
--- a/llvm/test/CodeGen/X86/O3-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O3-pipeline.ll
@@ -51,6 +51,7 @@
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Interleaved Access Pass
+; CHECK-NEXT:       X86 Partial Reduction
 ; CHECK-NEXT:       Expand indirectbr instructions
 ; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Natural Loop Information

diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 43bb8ee004a2..cad6f61e5545 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -236,10 +236,10 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm2
 ; AVX1-NEXT:    vmovdqu 16(%rsi,%rcx,2), %xmm3
+; AVX1-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
 ; AVX1-NEXT:    vpmaddwd 16(%rdi,%rcx,2), %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
 ; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $16, %rcx
@@ -407,16 +407,16 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX1-NEXT:    vmovdqu 16(%rsi,%rcx,2), %xmm4
 ; AVX1-NEXT:    vmovdqu 32(%rsi,%rcx,2), %xmm5
 ; AVX1-NEXT:    vmovdqu 48(%rsi,%rcx,2), %xmm6
+; AVX1-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3
+; AVX1-NEXT:    vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
+; AVX1-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
 ; AVX1-NEXT:    vpmaddwd 48(%rdi,%rcx,2), %xmm6, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
 ; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %xmm5, %xmm5
 ; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT:    vpmaddwd 16(%rdi,%rcx,2), %xmm4, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $16, %rcx
@@ -453,10 +453,10 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea
 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX2-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm3
 ; AVX2-NEXT:    vmovdqu 32(%rsi,%rcx,2), %ymm4
-; AVX2-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm4
-; AVX2-NEXT:    vpaddd %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3
+; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    addq $16, %rcx
 ; AVX2-NEXT:    cmpq %rcx, %rax
 ; AVX2-NEXT:    jne .LBB3_1
@@ -779,18 +779,18 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SSE2-NEXT:    movdqu (%rdi,%rcx), %xmm3
 ; SSE2-NEXT:    movdqu (%rsi,%rcx), %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
 ; SSE2-NEXT:    psraw $8, %xmm5
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
 ; SSE2-NEXT:    psraw $8, %xmm6
 ; SSE2-NEXT:    pmaddwd %xmm5, %xmm6
-; SSE2-NEXT:    paddd %xmm6, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    paddd %xmm6, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    psraw $8, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    psraw $8, %xmm4
 ; SSE2-NEXT:    pmaddwd %xmm3, %xmm4
-; SSE2-NEXT:    paddd %xmm4, %xmm2
+; SSE2-NEXT:    paddd %xmm4, %xmm1
 ; SSE2-NEXT:    addq $16, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
 ; SSE2-NEXT:    jne .LBB6_1
@@ -814,16 +814,16 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB6_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm2
-; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm3
-; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm4
-; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm2
+; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm3
 ; AVX1-NEXT:    vpmovsxbw 8(%rsi,%rcx), %xmm4
+; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm4
 ; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $16, %rcx
 ; AVX1-NEXT:    cmpq %rcx, %rax
 ; AVX1-NEXT:    jne .LBB6_1
@@ -943,34 +943,34 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB7_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT:    movdqu (%rdi,%rcx), %xmm10
-; SSE2-NEXT:    movdqu 16(%rdi,%rcx), %xmm7
-; SSE2-NEXT:    movdqu (%rsi,%rcx), %xmm9
-; SSE2-NEXT:    movdqu 16(%rsi,%rcx), %xmm0
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; SSE2-NEXT:    movdqu (%rdi,%rcx), %xmm7
+; SSE2-NEXT:    movdqu 16(%rdi,%rcx), %xmm10
+; SSE2-NEXT:    movdqu (%rsi,%rcx), %xmm0
+; SSE2-NEXT:    movdqu 16(%rsi,%rcx), %xmm9
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
 ; SSE2-NEXT:    psraw $8, %xmm5
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
 ; SSE2-NEXT:    psraw $8, %xmm6
 ; SSE2-NEXT:    pmaddwd %xmm5, %xmm6
-; SSE2-NEXT:    paddd %xmm6, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; SSE2-NEXT:    paddd %xmm6, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
 ; SSE2-NEXT:    psraw $8, %xmm5
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; SSE2-NEXT:    psraw $8, %xmm0
 ; SSE2-NEXT:    pmaddwd %xmm5, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
+; SSE2-NEXT:    paddd %xmm0, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
 ; SSE2-NEXT:    psraw $8, %xmm0
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
 ; SSE2-NEXT:    psraw $8, %xmm5
 ; SSE2-NEXT:    pmaddwd %xmm0, %xmm5
-; SSE2-NEXT:    paddd %xmm5, %xmm4
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
+; SSE2-NEXT:    paddd %xmm5, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
 ; SSE2-NEXT:    psraw $8, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
 ; SSE2-NEXT:    psraw $8, %xmm5
 ; SSE2-NEXT:    pmaddwd %xmm0, %xmm5
-; SSE2-NEXT:    paddd %xmm5, %xmm2
+; SSE2-NEXT:    paddd %xmm5, %xmm3
 ; SSE2-NEXT:    addq $32, %rcx
 ; SSE2-NEXT:    cmpq %rcx, %rax
 ; SSE2-NEXT:    jne .LBB7_1
@@ -999,26 +999,26 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB7_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm3
-; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm4
-; AVX1-NEXT:    vpmovsxbw 16(%rdi,%rcx), %xmm5
-; AVX1-NEXT:    vpmovsxbw 24(%rdi,%rcx), %xmm6
-; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm7
+; AVX1-NEXT:    vpmovsxbw 24(%rdi,%rcx), %xmm3
+; AVX1-NEXT:    vpmovsxbw 16(%rdi,%rcx), %xmm4
+; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm5
+; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm6
+; AVX1-NEXT:    vpmovsxbw 24(%rsi,%rcx), %xmm7
 ; AVX1-NEXT:    vpmaddwd %xmm3, %xmm7, %xmm3
-; AVX1-NEXT:    vpmovsxbw 8(%rsi,%rcx), %xmm7
-; AVX1-NEXT:    vpmaddwd %xmm4, %xmm7, %xmm4
 ; AVX1-NEXT:    vpmovsxbw 16(%rsi,%rcx), %xmm7
+; AVX1-NEXT:    vpmaddwd %xmm4, %xmm7, %xmm4
+; AVX1-NEXT:    vpmovsxbw 8(%rsi,%rcx), %xmm7
 ; AVX1-NEXT:    vpmaddwd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT:    vpmovsxbw 24(%rsi,%rcx), %xmm7
+; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm7
 ; AVX1-NEXT:    vpmaddwd %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vpaddd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm6, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $32, %rcx
 ; AVX1-NEXT:    cmpq %rcx, %rax
 ; AVX1-NEXT:    jne .LBB7_1
@@ -1051,14 +1051,14 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl
 ; AVX2-NEXT:    .p2align 4, 0x90
 ; AVX2-NEXT:  .LBB7_1: # %vector.body
 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm3
-; AVX2-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm4
-; AVX2-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm5
+; AVX2-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm3
+; AVX2-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm4
+; AVX2-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm5
 ; AVX2-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm3
-; AVX2-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm3
+; AVX2-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT:    addq $32, %rcx
 ; AVX2-NEXT:    cmpq %rcx, %rax
 ; AVX2-NEXT:    jne .LBB7_1
@@ -1913,9 +1913,9 @@ define <8 x i32> @pmaddwd_16(<16 x i16> %A, <16 x i16> %B) {
 ;
 ; AVX1-LABEL: pmaddwd_16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpmaddwd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -1944,16 +1944,16 @@ define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
 ;
 ; AVX1-LABEL: pmaddwd_32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT:    vpmaddwd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT:    vpmaddwd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: pmaddwd_32:
@@ -1964,9 +1964,9 @@ define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
 ;
 ; AVX512F-LABEL: pmaddwd_32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -2126,9 +2126,9 @@ define <8 x i32> @jumbled_indices8(<16 x i16> %A, <16 x i16> %B) {
 ;
 ; AVX1-LABEL: jumbled_indices8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpmaddwd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -2157,16 +2157,16 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) {
 ;
 ; AVX1-LABEL: jumbled_indices16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT:    vpmaddwd %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT:    vpmaddwd %xmm6, %xmm5, %xmm5
+; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: jumbled_indices16:
@@ -2177,9 +2177,9 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) {
 ;
 ; AVX512F-LABEL: jumbled_indices16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -2221,26 +2221,26 @@ define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) {
 ;
 ; AVX1-LABEL: jumbled_indices32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm10
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm11
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm12
-; AVX1-NEXT:    vpmaddwd %xmm12, %xmm8, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm12
-; AVX1-NEXT:    vpmaddwd %xmm12, %xmm9, %xmm9
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm12
-; AVX1-NEXT:    vpmaddwd %xmm12, %xmm10, %xmm10
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm12
-; AVX1-NEXT:    vpmaddwd %xmm12, %xmm11, %xmm11
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT:    vpmaddwd %xmm8, %xmm9, %xmm8
 ; AVX1-NEXT:    vpmaddwd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpmaddwd %xmm4, %xmm8, %xmm4
 ; AVX1-NEXT:    vpmaddwd %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm1, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpmaddwd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm2, %ymm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpmaddwd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm3, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: jumbled_indices32:
@@ -2656,7 +2656,7 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
 ; AVX-NEXT:    vmovdqu (%rdx), %xmm1
 ; AVX-NEXT:    vpmaddwd (%rcx), %xmm1, %xmm1
 ; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
@@ -2698,14 +2698,14 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
 ; SSE2-NEXT:    movdqu (%r8), %xmm0
 ; SSE2-NEXT:    movdqu (%r9), %xmm3
 ; SSE2-NEXT:    pmaddwd %xmm0, %xmm3
-; SSE2-NEXT:    paddd %xmm2, %xmm3
+; SSE2-NEXT:    paddd %xmm1, %xmm3
 ; SSE2-NEXT:    movdqu (%rax), %xmm0
-; SSE2-NEXT:    movdqu (%r10), %xmm2
-; SSE2-NEXT:    pmaddwd %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm3, %xmm2
-; SSE2-NEXT:    paddd %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    movdqu (%r10), %xmm1
+; SSE2-NEXT:    pmaddwd %xmm0, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm1
+; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm1, %eax
@@ -2721,11 +2721,11 @@ define i32 @madd_quad_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>* %a
 ; AVX-NEXT:    vpmaddwd (%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu (%r8), %xmm2
 ; AVX-NEXT:    vpmaddwd (%r9), %xmm2, %xmm2
-; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu (%rax), %xmm2
 ; AVX-NEXT:    vpmaddwd (%r10), %xmm2, %xmm2
-; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]

diff  --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 543da1e999c6..44ed30394a44 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -180,14 +180,14 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB8_1: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm3
-; CHECK-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm4
-; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm5
+; CHECK-NEXT:    vpmovsxbw 16(%rdi,%rcx), %ymm3
+; CHECK-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm4
+; CHECK-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm5
 ; CHECK-NEXT:    vpmaddwd %ymm3, %ymm5, %ymm3
-; CHECK-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
-; CHECK-NEXT:    vpmovsxbw 16(%rsi,%rcx), %ymm3
-; CHECK-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
 ; CHECK-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
+; CHECK-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm3
+; CHECK-NEXT:    vpmaddwd %ymm4, %ymm3, %ymm3
+; CHECK-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
 ; CHECK-NEXT:    addq $32, %rcx
 ; CHECK-NEXT:    cmpq %rcx, %rax
 ; CHECK-NEXT:    jne .LBB8_1

diff  --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 64845c847dff..6a742068f701 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -158,12 +158,12 @@ define i32 @sad_32i8() nounwind {
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB1_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT:    movdqa a+1040(%rax), %xmm3
-; SSE2-NEXT:    psadbw b+1040(%rax), %xmm3
-; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa a+1024(%rax), %xmm3
 ; SSE2-NEXT:    psadbw b+1024(%rax), %xmm3
 ; SSE2-NEXT:    paddd %xmm3, %xmm2
+; SSE2-NEXT:    movdqa a+1040(%rax), %xmm3
+; SSE2-NEXT:    psadbw b+1040(%rax), %xmm3
+; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    addq $4, %rax
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
@@ -188,14 +188,14 @@ define i32 @sad_32i8() nounwind {
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB1_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vmovdqa a+1040(%rax), %xmm2
-; AVX1-NEXT:    vpsadbw b+1040(%rax), %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa a+1024(%rax), %xmm3
-; AVX1-NEXT:    vpsadbw b+1024(%rax), %xmm3, %xmm3
-; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vmovdqa a+1024(%rax), %xmm2
+; AVX1-NEXT:    vpsadbw b+1024(%rax), %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa a+1040(%rax), %xmm3
+; AVX1-NEXT:    vpsadbw b+1040(%rax), %xmm3, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $4, %rax
 ; AVX1-NEXT:    jne .LBB1_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
@@ -320,15 +320,15 @@ define i32 @sad_avx64i8() nounwind {
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB2_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT:    movdqa a+1056(%rax), %xmm5
-; SSE2-NEXT:    psadbw b+1056(%rax), %xmm5
-; SSE2-NEXT:    paddd %xmm5, %xmm2
-; SSE2-NEXT:    movdqa a+1040(%rax), %xmm5
-; SSE2-NEXT:    psadbw b+1040(%rax), %xmm5
-; SSE2-NEXT:    paddd %xmm5, %xmm3
 ; SSE2-NEXT:    movdqa a+1024(%rax), %xmm5
 ; SSE2-NEXT:    psadbw b+1024(%rax), %xmm5
 ; SSE2-NEXT:    paddd %xmm5, %xmm0
+; SSE2-NEXT:    movdqa a+1040(%rax), %xmm5
+; SSE2-NEXT:    psadbw b+1040(%rax), %xmm5
+; SSE2-NEXT:    paddd %xmm5, %xmm3
+; SSE2-NEXT:    movdqa a+1056(%rax), %xmm5
+; SSE2-NEXT:    psadbw b+1056(%rax), %xmm5
+; SSE2-NEXT:    paddd %xmm5, %xmm2
 ; SSE2-NEXT:    movdqa a+1072(%rax), %xmm5
 ; SSE2-NEXT:    psadbw b+1072(%rax), %xmm5
 ; SSE2-NEXT:    paddd %xmm5, %xmm1
@@ -364,22 +364,22 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB2_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vmovdqa a+1072(%rax), %xmm3
-; AVX1-NEXT:    vpsadbw b+1072(%rax), %xmm3, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa a+1056(%rax), %xmm4
-; AVX1-NEXT:    vpsadbw b+1056(%rax), %xmm4, %xmm4
-; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vmovdqa a+1040(%rax), %xmm3
-; AVX1-NEXT:    vpsadbw b+1040(%rax), %xmm3, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqa a+1024(%rax), %xmm4
-; AVX1-NEXT:    vpsadbw b+1024(%rax), %xmm4, %xmm4
-; AVX1-NEXT:    vpaddd %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa a+1024(%rax), %xmm3
+; AVX1-NEXT:    vpsadbw b+1024(%rax), %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa a+1040(%rax), %xmm4
+; AVX1-NEXT:    vpsadbw b+1040(%rax), %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa a+1056(%rax), %xmm5
+; AVX1-NEXT:    vpsadbw b+1056(%rax), %xmm5, %xmm5
+; AVX1-NEXT:    vmovdqa a+1072(%rax), %xmm6
+; AVX1-NEXT:    vpsadbw b+1072(%rax), %xmm6, %xmm6
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
+; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX1-NEXT:    addq $4, %rax
 ; AVX1-NEXT:    jne .LBB2_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
@@ -416,12 +416,12 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX2-NEXT:    .p2align 4, 0x90
 ; AVX2-NEXT:  .LBB2_1: # %vector.body
 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT:    vmovdqa a+1056(%rax), %ymm3
-; AVX2-NEXT:    vpsadbw b+1056(%rax), %ymm3, %ymm3
-; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    vmovdqa a+1024(%rax), %ymm3
 ; AVX2-NEXT:    vpsadbw b+1024(%rax), %ymm3, %ymm3
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vmovdqa a+1056(%rax), %ymm3
+; AVX2-NEXT:    vpsadbw b+1056(%rax), %ymm3, %ymm3
+; AVX2-NEXT:    vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT:    addq $4, %rax
 ; AVX2-NEXT:    jne .LBB2_1
 ; AVX2-NEXT:  # %bb.2: # %middle.block
@@ -449,11 +449,11 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX512F-NEXT:    .p2align 4, 0x90
 ; AVX512F-NEXT:  .LBB2_1: # %vector.body
 ; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX512F-NEXT:    vmovdqa a+1056(%rax), %ymm2
-; AVX512F-NEXT:    vpsadbw b+1056(%rax), %ymm2, %ymm2
-; AVX512F-NEXT:    vmovdqa a+1024(%rax), %ymm3
-; AVX512F-NEXT:    vpsadbw b+1024(%rax), %ymm3, %ymm3
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vmovdqa a+1024(%rax), %ymm2
+; AVX512F-NEXT:    vpsadbw b+1024(%rax), %ymm2, %ymm2
+; AVX512F-NEXT:    vmovdqa a+1056(%rax), %ymm3
+; AVX512F-NEXT:    vpsadbw b+1056(%rax), %ymm3, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
 ; AVX512F-NEXT:    addq $4, %rax
 ; AVX512F-NEXT:    jne .LBB2_1
@@ -554,10 +554,10 @@ define i32 @sad_2i8() nounwind {
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE2-NEXT:    pand %xmm1, %xmm3
 ; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    psadbw %xmm3, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm1, %xmm3
+; SSE2-NEXT:    psadbw %xmm2, %xmm3
+; SSE2-NEXT:    paddd %xmm3, %xmm0
 ; SSE2-NEXT:    addq $4, %rax
 ; SSE2-NEXT:    jne .LBB3_1
 ; SSE2-NEXT:  # %bb.2: # %middle.block
@@ -576,8 +576,8 @@ define i32 @sad_2i8() nounwind {
 ; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; AVX-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpsadbw %xmm3, %xmm2, %xmm2
 ; AVX-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
 ; AVX-NEXT:    addq $4, %rax
@@ -649,7 +649,7 @@ define i32 @sad_4i8() nounwind {
 ; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; AVX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT:    vpsadbw %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    addq $4, %rax
 ; AVX-NEXT:    jne .LBB4_1
@@ -987,75 +987,36 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; SSE2-NEXT:    movdqu (%rdi), %xmm0
 ; SSE2-NEXT:    movdqu (%rsi), %xmm1
 ; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movdqu (%rdx), %xmm0
+; SSE2-NEXT:    movdqu (%rcx), %xmm2
+; SSE2-NEXT:    psadbw %xmm0, %xmm2
 ; SSE2-NEXT:    movl $1, %eax
 ; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movdqu (%rdx), %xmm2
-; SSE2-NEXT:    movdqu (%rcx), %xmm3
-; SSE2-NEXT:    psadbw %xmm2, %xmm3
-; SSE2-NEXT:    paddd %xmm0, %xmm3
-; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm3, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
-; AVX1-LABEL: sad_unroll_nonzero_initial:
-; AVX1:       # %bb.0: # %bb
-; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX1-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX1-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX1-NEXT:    movl $1, %eax
-; AVX1-NEXT:    vmovd %eax, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sad_unroll_nonzero_initial:
-; AVX2:       # %bb.0: # %bb
-; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX2-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT:    movl $1, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vmovdqu (%rdx), %xmm2
-; AVX2-NEXT:    vpsadbw (%rcx), %xmm2, %xmm2
-; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sad_unroll_nonzero_initial:
-; AVX512:       # %bb.0: # %bb
-; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT:    movl $1, %eax
-; AVX512-NEXT:    vmovd %eax, %xmm1
-; AVX512-NEXT:    vmovdqu (%rdx), %xmm2
-; AVX512-NEXT:    vpsadbw (%rcx), %xmm2, %xmm2
-; AVX512-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: sad_unroll_nonzero_initial:
+; AVX:       # %bb.0: # %bb
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu (%rdx), %xmm1
+; AVX-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
+; AVX-NEXT:    movl $1, %eax
+; AVX-NEXT:    vmovd %eax, %xmm2
+; AVX-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
 bb:
   %tmp = load <16 x i8>, <16 x i8>* %arg, align 1
   %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
@@ -1112,7 +1073,7 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
 ; AVX-NEXT:    vmovdqu (%rdx), %xmm1
 ; AVX-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
 ; AVX-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]


        

