[llvm] 8f25e38 - [X86] Add basic vector handling for ISD::ABDS/ABDU (absolute difference) nodes

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sat Feb 4 03:26:06 PST 2023


Author: Simon Pilgrim
Date: 2023-02-04T11:25:51Z
New Revision: 8f25e382c5b1bbaafd738f44de856c85ce845bbe

URL: https://github.com/llvm/llvm-project/commit/8f25e382c5b1bbaafd738f44de856c85ce845bbe
DIFF: https://github.com/llvm/llvm-project/commit/8f25e382c5b1bbaafd738f44de856c85ce845bbe.diff

LOG: [X86] Add basic vector handling for ISD::ABDS/ABDU (absolute difference) nodes

I intend to add generic legalization in the future, but for now I've added basic support for targets that have the MIN/MAX operations needed to expand these nodes to SUB(MAX(X,Y),MIN(X,Y)).
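
As a rough scalar sketch of that expansion (illustrative only, not part of the patch; the 32-bit element width and function names are assumptions):

  // Scalar equivalents of the MIN/MAX expansion described above.
  #include <algorithm>
  #include <cstdint>

  uint32_t abdu32(uint32_t X, uint32_t Y) {
    return std::max(X, Y) - std::min(X, Y);   // sub(umax(X,Y), umin(X,Y))
  }

  int32_t abds32(int32_t X, int32_t Y) {
    // Subtract in unsigned so the wrap-around is well defined, matching the node.
    return (int32_t)((uint32_t)std::max(X, Y) - (uint32_t)std::min(X, Y)); // sub(smax, smin)
  }

The vector lowering below (LowerABD) applies the same idea lane-wise, splitting 256/512-bit types first when the subtarget lacks the wider MIN/MAX instructions.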

This exposed a couple of issues with the DAG combines - in particular, we need to catch trunc(abs(sub(ext(x),ext(y)))) patterns earlier, before the SSE/AVX vector trunc expansion folds trigger.
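
A minimal scalar sketch of that pattern (hypothetical helper; i16/i32 widths are assumed for illustration): the absolute value of the widened difference, truncated back, matches the narrow ABDS result bit-for-bit.

  // trunc(abs(sub(sext(x), sext(y)))) folded to abds(x, y), scalar form.
  #include <cstdint>

  int16_t abds_via_ext(int16_t X, int16_t Y) {
    int32_t Diff = (int32_t)X - (int32_t)Y; // sub(sext(X), sext(Y))
    int32_t Wide = Diff < 0 ? -Diff : Diff; // abs(...)
    return (int16_t)Wide;                   // trunc to i16, same bits as ABDS(X, Y)
  }

Handling this in visitTRUNCATE lets the combine fire once the ABS has been wrapped in a truncate, rather than only on a bare ABS node.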

Differential Revision: https://reviews.llvm.org/D142288

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/abds-vector-128.ll
    llvm/test/CodeGen/X86/abds-vector-256.ll
    llvm/test/CodeGen/X86/abds-vector-512.ll
    llvm/test/CodeGen/X86/abdu-vector-128.ll
    llvm/test/CodeGen/X86/abdu-vector-256.ll
    llvm/test/CodeGen/X86/abdu-vector-512.ll
    llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
    llvm/test/CodeGen/X86/midpoint-int-vec-512.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b61a2eef9dc21..273459f17622a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3815,7 +3815,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
         (N0.getOperand(0) != N1.getOperand(1) ||
          N0.getOperand(1) != N1.getOperand(0)))
       return SDValue();
-    if (!TLI.isOperationLegalOrCustom(Abd, VT))
+    if (!hasOperation(Abd, VT))
       return SDValue();
     return DAG.getNode(Abd, DL, VT, N0.getOperand(0), N0.getOperand(1));
   };
@@ -10159,13 +10159,23 @@ SDValue DAGCombiner::visitSHLSAT(SDNode *N) {
   return SDValue();
 }
 
-// Given a ABS node, detect the following pattern:
+// Given a ABS node, detect the following patterns:
 // (ABS (SUB (EXTEND a), (EXTEND b))).
+// (TRUNC (ABS (SUB (EXTEND a), (EXTEND b)))).
 // Generates UABD/SABD instruction.
 SDValue DAGCombiner::foldABSToABD(SDNode *N) {
+  EVT SrcVT = N->getValueType(0);
+
+  if (N->getOpcode() == ISD::TRUNCATE)
+    N = N->getOperand(0).getNode();
+
+  if (N->getOpcode() != ISD::ABS)
+    return SDValue();
+
   EVT VT = N->getValueType(0);
   SDValue AbsOp1 = N->getOperand(0);
   SDValue Op0, Op1;
+  SDLoc DL(N);
 
   if (AbsOp1.getOpcode() != ISD::SUB)
     return SDValue();
@@ -10178,9 +10188,12 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N) {
   if (Opc0 != Op1.getOpcode() ||
       (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND)) {
     // fold (abs (sub nsw x, y)) -> abds(x, y)
+    // Limit this to legal ops to prevent loss of sub_nsw pattern.
     if (AbsOp1->getFlags().hasNoSignedWrap() &&
-        TLI.isOperationLegalOrCustom(ISD::ABDS, VT))
-      return DAG.getNode(ISD::ABDS, SDLoc(N), VT, Op0, Op1);
+        TLI.isOperationLegal(ISD::ABDS, VT)) {
+      SDValue ABD = DAG.getNode(ISD::ABDS, DL, VT, Op0, Op1);
+      return DAG.getZExtOrTrunc(ABD, DL, SrcVT);
+    }
     return SDValue();
   }
 
@@ -10191,17 +10204,20 @@ SDValue DAGCombiner::foldABSToABD(SDNode *N) {
   // fold abs(sext(x) - sext(y)) -> zext(abds(x, y))
   // fold abs(zext(x) - zext(y)) -> zext(abdu(x, y))
   // NOTE: Extensions must be equivalent.
-  if (VT1 == VT2 && TLI.isOperationLegalOrCustom(ABDOpcode, VT1)) {
+  if (VT1 == VT2 && hasOperation(ABDOpcode, VT1)) {
     Op0 = Op0.getOperand(0);
     Op1 = Op1.getOperand(0);
-    SDValue ABD = DAG.getNode(ABDOpcode, SDLoc(N), VT1, Op0, Op1);
-    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, ABD);
+    SDValue ABD = DAG.getNode(ABDOpcode, DL, VT1, Op0, Op1);
+    ABD = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, ABD);
+    return DAG.getZExtOrTrunc(ABD, DL, SrcVT);
   }
 
   // fold abs(sext(x) - sext(y)) -> abds(sext(x), sext(y))
   // fold abs(zext(x) - zext(y)) -> abdu(zext(x), zext(y))
-  if (TLI.isOperationLegalOrCustom(ABDOpcode, VT))
-    return DAG.getNode(ABDOpcode, SDLoc(N), VT, Op0, Op1);
+  if (hasOperation(ABDOpcode, VT)) {
+    SDValue ABD = DAG.getNode(ABDOpcode, DL, VT, Op0, Op1);
+    return DAG.getZExtOrTrunc(ABD, DL, SrcVT);
+  }
 
   return SDValue();
 }
@@ -13948,6 +13964,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   if (SDValue V = foldSubToUSubSat(VT, N0.getNode()))
     return V;
 
+  if (SDValue ABD = foldABSToABD(N))
+    return ABD;
+
   // Attempt to pre-truncate BUILD_VECTOR sources.
   if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
       TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a218871ae0908..fc60ae72cdbad 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1057,6 +1057,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
     }
 
+    setOperationAction(ISD::ABDU,               MVT::v16i8, Custom);
+    setOperationAction(ISD::ABDS,               MVT::v8i16, Custom);
+
     setOperationAction(ISD::UADDSAT,            MVT::v16i8, Legal);
     setOperationAction(ISD::SADDSAT,            MVT::v16i8, Legal);
     setOperationAction(ISD::USUBSAT,            MVT::v16i8, Legal);
@@ -1253,6 +1256,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
 
+    for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) {
+      setOperationAction(ISD::ABDS,             VT, Custom);
+      setOperationAction(ISD::ABDU,             VT, Custom);
+    }
+
     setOperationAction(ISD::UADDSAT,            MVT::v4i32, Custom);
     setOperationAction(ISD::SADDSAT,            MVT::v2i64, Custom);
     setOperationAction(ISD::SSUBSAT,            MVT::v2i64, Custom);
@@ -1491,6 +1499,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
+      setOperationAction(ISD::ABDS, VT, Custom);
+      setOperationAction(ISD::ABDU, VT, Custom);
     }
 
     for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
@@ -1818,6 +1828,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::SMIN,             VT, Legal);
       setOperationAction(ISD::UMIN,             VT, Legal);
       setOperationAction(ISD::ABS,              VT, Legal);
+      setOperationAction(ISD::ABDS,             VT, Custom);
+      setOperationAction(ISD::ABDU,             VT, Custom);
       setOperationAction(ISD::CTPOP,            VT, Custom);
       setOperationAction(ISD::STRICT_FSETCC,    VT, Custom);
       setOperationAction(ISD::STRICT_FSETCCS,   VT, Custom);
@@ -1825,6 +1837,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
       setOperationAction(ISD::ABS,     VT, HasBWI ? Legal : Custom);
+      setOperationAction(ISD::ABDS,    VT, Custom);
+      setOperationAction(ISD::ABDU,    VT, Custom);
       setOperationAction(ISD::CTPOP,   VT, Subtarget.hasBITALG() ? Legal : Custom);
       setOperationAction(ISD::CTLZ,    VT, Custom);
       setOperationAction(ISD::SMAX,    VT, HasBWI ? Legal : Custom);
@@ -1954,6 +1968,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::SMIN, VT, Legal);
       setOperationAction(ISD::UMIN, VT, Legal);
       setOperationAction(ISD::ABS,  VT, Legal);
+      setOperationAction(ISD::ABDS, VT, Custom);
+      setOperationAction(ISD::ABDU, VT, Custom);
     }
 
     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
@@ -29622,6 +29638,28 @@ static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
   return SDValue();
 }
 
+static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
+                        SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
+
+  // For AVX1 cases, split to use legal ops.
+  if (VT.is256BitVector() && !Subtarget.hasInt256())
+    return splitVectorIntBinary(Op, DAG);
+
+  if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
+    return splitVectorIntBinary(Op, DAG);
+
+  // Default to expand: sub(smax(lhs,rhs),smin(lhs,rhs))
+  // TODO: Add TargetLowering expandABD() support.
+  SDLoc dl(Op);
+  bool IsSigned = Op.getOpcode() == ISD::ABDS;
+  SDValue LHS = DAG.getFreeze(Op.getOperand(0));
+  SDValue RHS = DAG.getFreeze(Op.getOperand(1));
+  SDValue Max = DAG.getNode(IsSigned ? ISD::SMAX : ISD::UMAX, dl, VT, LHS, RHS);
+  SDValue Min = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, dl, VT, LHS, RHS);
+  return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
+}
+
 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
   SDLoc dl(Op);
@@ -33325,6 +33363,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::UMAX:
   case ISD::UMIN:               return LowerMINMAX(Op, Subtarget, DAG);
   case ISD::ABS:                return LowerABS(Op, Subtarget, DAG);
+  case ISD::ABDS:
+  case ISD::ABDU:               return LowerABD(Op, Subtarget, DAG);
   case ISD::AVGCEILU:           return LowerAVG(Op, Subtarget, DAG);
   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);

diff --git a/llvm/test/CodeGen/X86/abds-vector-128.ll b/llvm/test/CodeGen/X86/abds-vector-128.ll
index 41384b1489ce1..6b2d56a833ce6 100644
--- a/llvm/test/CodeGen/X86/abds-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abds-vector-128.ll
@@ -142,250 +142,17 @@ define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE42-LABEL: abd_ext_v16i8:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    psrld $16, %xmm2
-; SSE42-NEXT:    pmovsxbq %xmm2, %xmm2
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm9
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; SSE42-NEXT:    pmovsxbq %xmm3, %xmm3
-; SSE42-NEXT:    movdqa %xmm0, %xmm4
-; SSE42-NEXT:    psrlq $48, %xmm4
-; SSE42-NEXT:    pmovsxbq %xmm4, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSE42-NEXT:    pmovsxbq %xmm4, %xmm4
-; SSE42-NEXT:    movdqa %xmm0, %xmm6
-; SSE42-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovsxbq %xmm6, %xmm6
-; SSE42-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; SSE42-NEXT:    pmovsxbq %xmm7, %xmm7
-; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm8
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrld $16, %xmm0
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm2
-; SSE42-NEXT:    pmovsxbq %xmm1, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm9
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrlq $48, %xmm0
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm4
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm6
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm7
-; SSE42-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovsxbq %xmm1, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm8
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm2, %xmm10
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm9, %xmm10
-; SSE42-NEXT:    movdqa %xmm9, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm9
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm3, %xmm10
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm3
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm5, %xmm10
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm5
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm4, %xmm10
-; SSE42-NEXT:    movdqa %xmm4, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm4
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm6, %xmm10
-; SSE42-NEXT:    movdqa %xmm6, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm6
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm7, %xmm10
-; SSE42-NEXT:    movdqa %xmm7, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm7
-; SSE42-NEXT:    psubq %xmm8, %xmm1
-; SSE42-NEXT:    movdqa %xmm8, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm8
-; SSE42-NEXT:    movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE42-NEXT:    andpd %xmm0, %xmm8
-; SSE42-NEXT:    andpd %xmm0, %xmm7
-; SSE42-NEXT:    packusdw %xmm8, %xmm7
-; SSE42-NEXT:    andpd %xmm0, %xmm6
-; SSE42-NEXT:    andpd %xmm0, %xmm4
-; SSE42-NEXT:    packusdw %xmm6, %xmm4
-; SSE42-NEXT:    packusdw %xmm7, %xmm4
-; SSE42-NEXT:    andpd %xmm0, %xmm5
-; SSE42-NEXT:    andpd %xmm0, %xmm3
-; SSE42-NEXT:    packusdw %xmm5, %xmm3
-; SSE42-NEXT:    andpd %xmm0, %xmm9
-; SSE42-NEXT:    andpd %xmm0, %xmm2
-; SSE42-NEXT:    packusdw %xmm2, %xmm9
-; SSE42-NEXT:    packusdw %xmm3, %xmm9
-; SSE42-NEXT:    packuswb %xmm4, %xmm9
-; SSE42-NEXT:    movdqa %xmm9, %xmm0
+; SSE42-NEXT:    pminsb %xmm1, %xmm2
+; SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; SSE42-NEXT:    psubb %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
-; AVX1-LABEL: abd_ext_v16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm7, %xmm7
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm6, %xmm6
-; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm8
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm9, %xmm10
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm11, %xmm11
-; AVX1-NEXT:    vpsubq %xmm11, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm11, %xmm11
-; AVX1-NEXT:    vpsubq %xmm11, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm10, %xmm10
-; AVX1-NEXT:    vpsubq %xmm10, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxbq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm9
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm10, %xmm10
-; AVX1-NEXT:    vpsubq %xmm10, %xmm7, %xmm7
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm6, %xmm6
-; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm8, %xmm8
-; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm11
-; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
-; AVX1-NEXT:    vpsubq %xmm5, %xmm12, %xmm5
-; AVX1-NEXT:    vpsubq %xmm4, %xmm12, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm11, %ymm4, %ymm11, %ymm4
-; AVX1-NEXT:    vpsubq %xmm2, %xmm12, %xmm2
-; AVX1-NEXT:    vpsubq %xmm3, %xmm12, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm10, %ymm2, %ymm10, %ymm2
-; AVX1-NEXT:    vpsubq %xmm6, %xmm12, %xmm3
-; AVX1-NEXT:    vpsubq %xmm7, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm3, %ymm9, %ymm3
-; AVX1-NEXT:    vpsubq %xmm0, %xmm12, %xmm0
-; AVX1-NEXT:    vpsubq %xmm8, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [255,255,255,255]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm1, %ymm4, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm3, %ymm3
-; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm4, %ymm4
-; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm5
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm6, %ymm6
-; AVX2-NEXT:    vpsubq %ymm6, %ymm3, %ymm3
-; AVX2-NEXT:    vpmovsxbq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm5, %ymm5
-; AVX2-NEXT:    vpsubq %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm0, %ymm0
-; AVX2-NEXT:    vpsubq %ymm4, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX2-NEXT:    vandpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vandpd %ymm2, %ymm4, %ymm4
-; AVX2-NEXT:    vpackusdw %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vandpd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: abd_ext_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm3
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = sext <16 x i8> %a to <16 x i64>
   %bext = sext <16 x i8> %b to <16 x i64>
   %sub = sub <16 x i64> %aext, %bext
@@ -527,250 +294,17 @@ define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE42-LABEL: abd_ext_v16i8_undef:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    psrld $16, %xmm2
-; SSE42-NEXT:    pmovsxbq %xmm2, %xmm2
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm9
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; SSE42-NEXT:    pmovsxbq %xmm3, %xmm3
-; SSE42-NEXT:    movdqa %xmm0, %xmm4
-; SSE42-NEXT:    psrlq $48, %xmm4
-; SSE42-NEXT:    pmovsxbq %xmm4, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSE42-NEXT:    pmovsxbq %xmm4, %xmm4
-; SSE42-NEXT:    movdqa %xmm0, %xmm6
-; SSE42-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovsxbq %xmm6, %xmm6
-; SSE42-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; SSE42-NEXT:    pmovsxbq %xmm7, %xmm7
-; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm8
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrld $16, %xmm0
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm2
-; SSE42-NEXT:    pmovsxbq %xmm1, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm9
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrlq $48, %xmm0
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm4
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm6
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; SSE42-NEXT:    pmovsxbq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm7
-; SSE42-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovsxbq %xmm1, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm8
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm2, %xmm10
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm9, %xmm10
-; SSE42-NEXT:    movdqa %xmm9, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm9
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm3, %xmm10
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm3
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm5, %xmm10
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm5
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm4, %xmm10
-; SSE42-NEXT:    movdqa %xmm4, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm4
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm6, %xmm10
-; SSE42-NEXT:    movdqa %xmm6, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm6
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm7, %xmm10
-; SSE42-NEXT:    movdqa %xmm7, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm7
-; SSE42-NEXT:    psubq %xmm8, %xmm1
-; SSE42-NEXT:    movdqa %xmm8, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm8
-; SSE42-NEXT:    movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE42-NEXT:    andpd %xmm0, %xmm8
-; SSE42-NEXT:    andpd %xmm0, %xmm7
-; SSE42-NEXT:    packusdw %xmm8, %xmm7
-; SSE42-NEXT:    andpd %xmm0, %xmm6
-; SSE42-NEXT:    andpd %xmm0, %xmm4
-; SSE42-NEXT:    packusdw %xmm6, %xmm4
-; SSE42-NEXT:    packusdw %xmm7, %xmm4
-; SSE42-NEXT:    andpd %xmm0, %xmm5
-; SSE42-NEXT:    andpd %xmm0, %xmm3
-; SSE42-NEXT:    packusdw %xmm5, %xmm3
-; SSE42-NEXT:    andpd %xmm0, %xmm9
-; SSE42-NEXT:    andpd %xmm0, %xmm2
-; SSE42-NEXT:    packusdw %xmm2, %xmm9
-; SSE42-NEXT:    packusdw %xmm3, %xmm9
-; SSE42-NEXT:    packuswb %xmm4, %xmm9
-; SSE42-NEXT:    movdqa %xmm9, %xmm0
+; SSE42-NEXT:    pminsb %xmm1, %xmm2
+; SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; SSE42-NEXT:    psubb %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
-; AVX1-LABEL: abd_ext_v16i8_undef:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm7, %xmm7
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm6, %xmm6
-; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm8
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm9, %xmm10
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm11, %xmm11
-; AVX1-NEXT:    vpsubq %xmm11, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm11, %xmm11
-; AVX1-NEXT:    vpsubq %xmm11, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm10, %xmm10
-; AVX1-NEXT:    vpsubq %xmm10, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxbq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm9
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm10, %xmm10
-; AVX1-NEXT:    vpsubq %xmm10, %xmm7, %xmm7
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm9[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm6, %xmm6
-; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm8, %xmm8
-; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm7, %ymm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm11
-; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
-; AVX1-NEXT:    vpsubq %xmm5, %xmm12, %xmm5
-; AVX1-NEXT:    vpsubq %xmm4, %xmm12, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm11, %ymm4, %ymm11, %ymm4
-; AVX1-NEXT:    vpsubq %xmm2, %xmm12, %xmm2
-; AVX1-NEXT:    vpsubq %xmm3, %xmm12, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm10, %ymm2, %ymm10, %ymm2
-; AVX1-NEXT:    vpsubq %xmm6, %xmm12, %xmm3
-; AVX1-NEXT:    vpsubq %xmm7, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm3, %ymm9, %ymm3
-; AVX1-NEXT:    vpsubq %xmm0, %xmm12, %xmm0
-; AVX1-NEXT:    vpsubq %xmm8, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [255,255,255,255]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm1, %ymm4, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v16i8_undef:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm3, %ymm3
-; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm4, %ymm4
-; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm5
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm6, %ymm6
-; AVX2-NEXT:    vpsubq %ymm6, %ymm3, %ymm3
-; AVX2-NEXT:    vpmovsxbq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm5, %ymm5
-; AVX2-NEXT:    vpsubq %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm0, %ymm0
-; AVX2-NEXT:    vpsubq %ymm4, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX2-NEXT:    vandpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vandpd %ymm2, %ymm4, %ymm4
-; AVX2-NEXT:    vpackusdw %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vandpd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: abd_ext_v16i8_undef:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm3
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v16i8_undef:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = sext <16 x i8> %a to <16 x i64>
   %bext = sext <16 x i8> %b to <16 x i64>
   %sub = sub <16 x i64> %aext, %bext
@@ -780,195 +314,20 @@ define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind {
 }
 
 define <8 x i16> @abd_ext_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
-; SSE2-LABEL: abd_ext_v8i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    psrad $16, %xmm2
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
-; SSE2-NEXT:    psrad $16, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT:    movdqa %xmm6, %xmm8
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; SSE2-NEXT:    psubq %xmm8, %xmm3
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; SSE2-NEXT:    psubq %xmm6, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; SSE2-NEXT:    psubq %xmm6, %xmm4
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NEXT:    psubq %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    psubq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: abd_ext_v8i16:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; SSE42-NEXT:    pmovsxwq %xmm2, %xmm2
-; SSE42-NEXT:    pmovsxwq %xmm0, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; SSE42-NEXT:    pmovsxwq %xmm3, %xmm3
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE42-NEXT:    pmovsxwq %xmm0, %xmm4
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE42-NEXT:    pmovsxwq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm2
-; SSE42-NEXT:    pmovsxwq %xmm1, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE42-NEXT:    pmovsxwq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; SSE42-NEXT:    pmovsxwq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm4
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm2, %xmm6
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm5, %xmm6
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm5
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm3, %xmm6
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm3
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm4, %xmm6
-; SSE42-NEXT:    movdqa %xmm4, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm4
-; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7]
-; SSE42-NEXT:    packusdw %xmm4, %xmm3
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7]
-; SSE42-NEXT:    packusdw %xmm2, %xmm5
-; SSE42-NEXT:    packusdw %xmm3, %xmm5
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    retq
-;
-; AVX1-LABEL: abd_ext_v8i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
-; AVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
-; AVX1-NEXT:    vpsubq %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm5
-; AVX1-NEXT:    vpsubq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm5
-; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpsubq %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpsubq %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm5, %ymm2
-; AVX1-NEXT:    vpsubq %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpsubq %xmm4, %xmm6, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v8i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
-; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; SSE-LABEL: abd_ext_v8i16:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pminsw %xmm1, %xmm2
+; SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE-NEXT:    psubw %xmm2, %xmm0
+; SSE-NEXT:    retq
 ;
-; AVX512-LABEL: abd_ext_v8i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = sext <8 x i16> %a to <8 x i64>
   %bext = sext <8 x i16> %b to <8 x i64>
   %sub = sub <8 x i64> %aext, %bext
@@ -978,195 +337,20 @@ define <8 x i16> @abd_ext_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 }
 
 define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind {
-; SSE2-LABEL: abd_ext_v8i16_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    psrad $16, %xmm2
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    pxor %xmm6, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
-; SSE2-NEXT:    psrad $16, %xmm6
-; SSE2-NEXT:    pxor %xmm7, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT:    movdqa %xmm6, %xmm8
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; SSE2-NEXT:    psubq %xmm8, %xmm3
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; SSE2-NEXT:    psubq %xmm6, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT:    movdqa %xmm1, %xmm6
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; SSE2-NEXT:    psubq %xmm6, %xmm4
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NEXT:    psubq %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    psubq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: abd_ext_v8i16_undef:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; SSE42-NEXT:    pmovsxwq %xmm2, %xmm2
-; SSE42-NEXT:    pmovsxwq %xmm0, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; SSE42-NEXT:    pmovsxwq %xmm3, %xmm3
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE42-NEXT:    pmovsxwq %xmm0, %xmm4
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE42-NEXT:    pmovsxwq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm2
-; SSE42-NEXT:    pmovsxwq %xmm1, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE42-NEXT:    pmovsxwq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; SSE42-NEXT:    pmovsxwq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm4
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm2, %xmm6
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm5, %xmm6
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm5
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm3, %xmm6
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm3
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm4, %xmm6
-; SSE42-NEXT:    movdqa %xmm4, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm4
-; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7]
-; SSE42-NEXT:    packusdw %xmm4, %xmm3
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7]
-; SSE42-NEXT:    packusdw %xmm2, %xmm5
-; SSE42-NEXT:    packusdw %xmm3, %xmm5
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    retq
-;
-; AVX1-LABEL: abd_ext_v8i16_undef:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
-; AVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
-; AVX1-NEXT:    vpsubq %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm5
-; AVX1-NEXT:    vpsubq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm5
-; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpsubq %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpsubq %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm5, %ymm2
-; AVX1-NEXT:    vpsubq %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpsubq %xmm4, %xmm6, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v8i16_undef:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
-; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; SSE-LABEL: abd_ext_v8i16_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pminsw %xmm1, %xmm2
+; SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; SSE-NEXT:    psubw %xmm2, %xmm0
+; SSE-NEXT:    retq
 ;
-; AVX512-LABEL: abd_ext_v8i16_undef:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v8i16_undef:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = sext <8 x i16> %a to <8 x i64>
   %bext = sext <8 x i16> %b to <8 x i64>
   %sub = sub <8 x i64> %aext, %bext
@@ -1209,69 +393,18 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; SSE42-LABEL: abd_ext_v4i32:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    pmovsxdq %xmm0, %xmm2
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE42-NEXT:    pmovsxdq %xmm0, %xmm3
-; SSE42-NEXT:    pmovsxdq %xmm1, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm2
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE42-NEXT:    pmovsxdq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm4, %xmm4
-; SSE42-NEXT:    psubq %xmm2, %xmm4
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
-; SSE42-NEXT:    psubq %xmm3, %xmm1
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
-; SSE42-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE42-NEXT:    movaps %xmm2, %xmm0
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    pminsd %xmm1, %xmm2
+; SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE42-NEXT:    psubd %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
-; AVX1-LABEL: abd_ext_v4i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm3
-; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm1
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v4i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: abd_ext_v4i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
-; AVX512-NEXT:    vpmovsxdq %xmm1, %ymm1
-; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpabsq %ymm0, %ymm0
-; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = sext <4 x i32> %a to <4 x i64>
   %bext = sext <4 x i32> %b to <4 x i64>
   %sub = sub <4 x i64> %aext, %bext
@@ -1314,69 +447,18 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
 ;
 ; SSE42-LABEL: abd_ext_v4i32_undef:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    pmovsxdq %xmm0, %xmm2
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; SSE42-NEXT:    pmovsxdq %xmm0, %xmm3
-; SSE42-NEXT:    pmovsxdq %xmm1, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm2
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE42-NEXT:    pmovsxdq %xmm0, %xmm0
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm4, %xmm4
-; SSE42-NEXT:    psubq %xmm2, %xmm4
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
-; SSE42-NEXT:    psubq %xmm3, %xmm1
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
-; SSE42-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE42-NEXT:    movaps %xmm2, %xmm0
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    pminsd %xmm1, %xmm2
+; SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; SSE42-NEXT:    psubd %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
-; AVX1-LABEL: abd_ext_v4i32_undef:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm3
-; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm1
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v4i32_undef:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: abd_ext_v4i32_undef:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxdq %xmm0, %ymm0
-; AVX512-NEXT:    vpmovsxdq %xmm1, %ymm1
-; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpabsq %ymm0, %ymm0
-; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v4i32_undef:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = sext <4 x i32> %a to <4 x i64>
   %bext = sext <4 x i32> %b to <4 x i64>
   %sub = sub <4 x i64> %aext, %bext
@@ -1446,34 +528,70 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE42-NEXT:    retq
 ;
-; AVX-LABEL: abd_ext_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovq %xmm0, %rax
-; AVX-NEXT:    movq %rax, %rcx
-; AVX-NEXT:    sarq $63, %rcx
-; AVX-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX-NEXT:    movq %rdx, %rsi
-; AVX-NEXT:    sarq $63, %rsi
-; AVX-NEXT:    vmovq %xmm1, %rdi
-; AVX-NEXT:    movq %rdi, %r8
-; AVX-NEXT:    sarq $63, %r8
-; AVX-NEXT:    vpextrq $1, %xmm1, %r9
-; AVX-NEXT:    movq %r9, %r10
-; AVX-NEXT:    sarq $63, %r10
-; AVX-NEXT:    subq %r9, %rdx
-; AVX-NEXT:    sbbq %r10, %rsi
-; AVX-NEXT:    subq %rdi, %rax
-; AVX-NEXT:    sbbq %r8, %rcx
-; AVX-NEXT:    sarq $63, %rcx
-; AVX-NEXT:    xorq %rcx, %rax
-; AVX-NEXT:    subq %rcx, %rax
-; AVX-NEXT:    sarq $63, %rsi
-; AVX-NEXT:    xorq %rsi, %rdx
-; AVX-NEXT:    subq %rsi, %rdx
-; AVX-NEXT:    vmovq %rdx, %xmm0
-; AVX-NEXT:    vmovq %rax, %xmm1
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT:    retq
+; AVX1-LABEL: abd_ext_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT:    movq %rdx, %rsi
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    vmovq %xmm1, %rdi
+; AVX1-NEXT:    movq %rdi, %r8
+; AVX1-NEXT:    sarq $63, %r8
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX1-NEXT:    movq %r9, %r10
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    subq %r9, %rdx
+; AVX1-NEXT:    sbbq %r10, %rsi
+; AVX1-NEXT:    subq %rdi, %rax
+; AVX1-NEXT:    sbbq %r8, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    subq %rcx, %rax
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    xorq %rsi, %rdx
+; AVX1-NEXT:    subq %rsi, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm0
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: abd_ext_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    movq %rdx, %rsi
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    vmovq %xmm1, %rdi
+; AVX2-NEXT:    movq %rdi, %r8
+; AVX2-NEXT:    sarq $63, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX2-NEXT:    movq %r9, %r10
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    subq %r9, %rdx
+; AVX2-NEXT:    sbbq %r10, %rsi
+; AVX2-NEXT:    subq %rdi, %rax
+; AVX2-NEXT:    sbbq %r8, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    subq %rcx, %rax
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    xorq %rsi, %rdx
+; AVX2-NEXT:    subq %rsi, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm0
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: abd_ext_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %aext = sext <2 x i64> %a to <2 x i128>
   %bext = sext <2 x i64> %b to <2 x i128>
   %sub = sub <2 x i128> %aext, %bext
@@ -1543,34 +661,70 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE42-NEXT:    retq
 ;
-; AVX-LABEL: abd_ext_v2i64_undef:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovq %xmm0, %rax
-; AVX-NEXT:    movq %rax, %rcx
-; AVX-NEXT:    sarq $63, %rcx
-; AVX-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX-NEXT:    movq %rdx, %rsi
-; AVX-NEXT:    sarq $63, %rsi
-; AVX-NEXT:    vmovq %xmm1, %rdi
-; AVX-NEXT:    movq %rdi, %r8
-; AVX-NEXT:    sarq $63, %r8
-; AVX-NEXT:    vpextrq $1, %xmm1, %r9
-; AVX-NEXT:    movq %r9, %r10
-; AVX-NEXT:    sarq $63, %r10
-; AVX-NEXT:    subq %r9, %rdx
-; AVX-NEXT:    sbbq %r10, %rsi
-; AVX-NEXT:    subq %rdi, %rax
-; AVX-NEXT:    sbbq %r8, %rcx
-; AVX-NEXT:    sarq $63, %rcx
-; AVX-NEXT:    xorq %rcx, %rax
-; AVX-NEXT:    subq %rcx, %rax
-; AVX-NEXT:    sarq $63, %rsi
-; AVX-NEXT:    xorq %rsi, %rdx
-; AVX-NEXT:    subq %rsi, %rdx
-; AVX-NEXT:    vmovq %rdx, %xmm0
-; AVX-NEXT:    vmovq %rax, %xmm1
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT:    retq
+; AVX1-LABEL: abd_ext_v2i64_undef:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT:    movq %rdx, %rsi
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    vmovq %xmm1, %rdi
+; AVX1-NEXT:    movq %rdi, %r8
+; AVX1-NEXT:    sarq $63, %r8
+; AVX1-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX1-NEXT:    movq %r9, %r10
+; AVX1-NEXT:    sarq $63, %r10
+; AVX1-NEXT:    subq %r9, %rdx
+; AVX1-NEXT:    sbbq %r10, %rsi
+; AVX1-NEXT:    subq %rdi, %rax
+; AVX1-NEXT:    sbbq %r8, %rcx
+; AVX1-NEXT:    sarq $63, %rcx
+; AVX1-NEXT:    xorq %rcx, %rax
+; AVX1-NEXT:    subq %rcx, %rax
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    xorq %rsi, %rdx
+; AVX1-NEXT:    subq %rsi, %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm0
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: abd_ext_v2i64_undef:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    movq %rdx, %rsi
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    vmovq %xmm1, %rdi
+; AVX2-NEXT:    movq %rdi, %r8
+; AVX2-NEXT:    sarq $63, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX2-NEXT:    movq %r9, %r10
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    subq %r9, %rdx
+; AVX2-NEXT:    sbbq %r10, %rsi
+; AVX2-NEXT:    subq %rdi, %rax
+; AVX2-NEXT:    sbbq %r8, %rcx
+; AVX2-NEXT:    sarq $63, %rcx
+; AVX2-NEXT:    xorq %rcx, %rax
+; AVX2-NEXT:    subq %rcx, %rax
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    xorq %rsi, %rdx
+; AVX2-NEXT:    subq %rsi, %rdx
+; AVX2-NEXT:    vmovq %rdx, %xmm0
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: abd_ext_v2i64_undef:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %aext = sext <2 x i64> %a to <2 x i128>
   %bext = sext <2 x i64> %b to <2 x i128>
   %sub = sub <2 x i128> %aext, %bext

diff  --git a/llvm/test/CodeGen/X86/abds-vector-256.ll b/llvm/test/CodeGen/X86/abds-vector-256.ll
index b24903fe1199a..afb2107ad3c56 100644
--- a/llvm/test/CodeGen/X86/abds-vector-256.ll
+++ b/llvm/test/CodeGen/X86/abds-vector-256.ll
@@ -10,296 +10,29 @@
 define <32 x i8> @abd_ext_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm7
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxbq %xmm4, %xmm10
-; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm4, %xmm11
-; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm4
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm8
-; AVX1-NEXT:    vpmovsxbq %xmm8, %xmm3
-; AVX1-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm8, %xmm12
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm13, %xmm14
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm15, %xmm15
-; AVX1-NEXT:    vpsubq %xmm15, %xmm6, %xmm0
-; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm15, %xmm15
-; AVX1-NEXT:    vpsubq %xmm15, %xmm7, %xmm7
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm14, %xmm14
-; AVX1-NEXT:    vpsubq %xmm14, %xmm2, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpmovsxbq %xmm13, %xmm2
-; AVX1-NEXT:    vpsubq %xmm2, %xmm10, %xmm6
-; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm13 = xmm2[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm13, %xmm13
-; AVX1-NEXT:    vpsubq %xmm13, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm2, %xmm11, %xmm11
-; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm2
-; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm0
-; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm2
-; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm10
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm13
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm14, %xmm14
-; AVX1-NEXT:    vpsubq %xmm14, %xmm9, %xmm9
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm14 = xmm12[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm14, %xmm14
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm15 = xmm13[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm15, %xmm15
-; AVX1-NEXT:    vpsubq %xmm15, %xmm14, %xmm15
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm12, %xmm12
-; AVX1-NEXT:    vpmovsxbq %xmm8, %xmm14
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm13[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm8, %xmm8
-; AVX1-NEXT:    vpsubq %xmm8, %xmm12, %xmm8
-; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm12
-; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm14, %xmm14
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm13, %xmm13
-; AVX1-NEXT:    vpsubq %xmm13, %xmm1, %xmm13
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm12[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
-; AVX1-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovsxbq %xmm3, %xmm0
-; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm12
-; AVX1-NEXT:    vpsubq %xmm12, %xmm0, %xmm12
-; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm0
-; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm4, %ymm3
-; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubq %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm4, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm3, %ymm4, %ymm3, %ymm3
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm4
-; AVX1-NEXT:    vpsubq %xmm6, %xmm0, %xmm6
-; AVX1-NEXT:    vpsubq %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-NEXT:    vblendvpd %ymm4, %ymm6, %ymm4, %ymm6
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm5, %ymm4
-; AVX1-NEXT:    vpsubq %xmm11, %xmm0, %xmm7
-; AVX1-NEXT:    vpsubq %xmm5, %xmm0, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
-; AVX1-NEXT:    vblendvpd %ymm4, %ymm5, %ymm4, %ymm5
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm11, %ymm4
-; AVX1-NEXT:    vpsubq %xmm10, %xmm0, %xmm7
-; AVX1-NEXT:    vpsubq %xmm11, %xmm0, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm10, %ymm7
-; AVX1-NEXT:    vblendvpd %ymm4, %ymm7, %ymm4, %ymm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm9, %ymm7
-; AVX1-NEXT:    vpsubq %xmm15, %xmm0, %xmm10
-; AVX1-NEXT:    vpsubq %xmm9, %xmm0, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vblendvpd %ymm7, %ymm9, %ymm7, %ymm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm14, %ymm9
-; AVX1-NEXT:    vpsubq %xmm14, %xmm0, %xmm10
-; AVX1-NEXT:    vpsubq %xmm8, %xmm0, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm10, %ymm8
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm8, %ymm9, %ymm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm13, %ymm9
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpsubq %xmm13, %xmm0, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm10, %ymm1
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm1, %ymm9, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm12, %ymm9
-; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpsubq %xmm12, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm0, %ymm9, %ymm2
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm0 = [255,255,255,255]
-; AVX1-NEXT:    vandpd %ymm0, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT:    vpackusdw %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm0, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9
-; AVX1-NEXT:    vpackusdw %xmm9, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vandpd %ymm0, %ymm8, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm8
-; AVX1-NEXT:    vpackusdw %xmm8, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm0, %ymm7, %ymm7
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
-; AVX1-NEXT:    vpackusdw %xmm8, %xmm7, %xmm7
-; AVX1-NEXT:    vpackusdw %xmm7, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vandpd %ymm0, %ymm4, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm0, %ymm5, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm0, %ymm6, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandpd %ymm0, %ymm3, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpackusdw %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm3, %ymm3
-; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm4
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm5, %ymm5
-; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm6
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm7, %ymm7
-; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm8, %ymm8
-; AVX2-NEXT:    vpmovsxwq %xmm6, %ymm6
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm9
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm10, %ymm10
-; AVX2-NEXT:    vpsubq %ymm10, %ymm3, %ymm3
-; AVX2-NEXT:    vpmovsxbq %xmm1, %ymm10
-; AVX2-NEXT:    vpsubq %ymm10, %ymm4, %ymm4
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm10, %ymm10
-; AVX2-NEXT:    vpsubq %ymm10, %ymm5, %ymm5
-; AVX2-NEXT:    vpmovsxwq %xmm9, %ymm9
-; AVX2-NEXT:    vpsubq %ymm9, %ymm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm9
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm10, %ymm10
-; AVX2-NEXT:    vpsubq %ymm10, %ymm7, %ymm7
-; AVX2-NEXT:    vpmovsxbq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm9, %ymm9
-; AVX2-NEXT:    vpsubq %ymm9, %ymm8, %ymm8
-; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm6, %ymm1
-; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX2-NEXT:    vpsubq %ymm3, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm9, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm4, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm9, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm5, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm5, %ymm9, %ymm5, %ymm5
-; AVX2-NEXT:    vpsubq %ymm2, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm9, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm7, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm7, %ymm9, %ymm7, %ymm7
-; AVX2-NEXT:    vpsubq %ymm0, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm9, %ymm0, %ymm0
-; AVX2-NEXT:    vpsubq %ymm8, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm8, %ymm9, %ymm8, %ymm8
-; AVX2-NEXT:    vpsubq %ymm1, %ymm6, %ymm6
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [255,255,255,255]
-; AVX2-NEXT:    vandpd %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vandpd %ymm6, %ymm8, %ymm8
-; AVX2-NEXT:    vpackusdw %ymm8, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm0, %ymm0
-; AVX2-NEXT:    vandpd %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpackusdw %ymm7, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm2, %ymm1
-; AVX2-NEXT:    vandpd %ymm6, %ymm5, %ymm2
-; AVX2-NEXT:    vpackusdw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm4, %ymm2
-; AVX2-NEXT:    vandpd %ymm6, %ymm3, %ymm3
-; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v32i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm4
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm5
-; AVX512-NEXT:    vpmovsxwq %xmm5, %zmm6
-; AVX512-NEXT:    vpsubq %zmm6, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX512-NEXT:    vpmovsxwq %xmm5, %zmm5
-; AVX512-NEXT:    vpsubq %zmm5, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm5
-; AVX512-NEXT:    vpsubq %zmm5, %zmm4, %zmm4
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm3, %zmm1
-; AVX512-NEXT:    vpabsq %zmm2, %zmm2
-; AVX512-NEXT:    vpabsq %zmm4, %zmm3
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = sext <32 x i8> %a to <32 x i64>
   %bext = sext <32 x i8> %b to <32 x i64>
@@ -312,296 +45,29 @@ define <32 x i8> @abd_ext_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <32 x i8> @abd_ext_v32i8_undef(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v32i8_undef:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm4, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm7
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxbq %xmm4, %xmm10
-; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm4, %xmm11
-; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm4
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm8
-; AVX1-NEXT:    vpmovsxbq %xmm8, %xmm3
-; AVX1-NEXT:    vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm3[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm8, %xmm12
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm12[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm13 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm13, %xmm14
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm15, %xmm15
-; AVX1-NEXT:    vpsubq %xmm15, %xmm6, %xmm0
-; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm15, %xmm15
-; AVX1-NEXT:    vpsubq %xmm15, %xmm7, %xmm7
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm14 = xmm14[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm14, %xmm14
-; AVX1-NEXT:    vpsubq %xmm14, %xmm2, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT:    vpmovsxbq %xmm13, %xmm2
-; AVX1-NEXT:    vpsubq %xmm2, %xmm10, %xmm6
-; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm13 = xmm2[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm13, %xmm13
-; AVX1-NEXT:    vpsubq %xmm13, %xmm5, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm2, %xmm11, %xmm11
-; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm2
-; AVX1-NEXT:    vpsubq %xmm2, %xmm4, %xmm0
-; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm2
-; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm10
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm13
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm14 = xmm13[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm14, %xmm14
-; AVX1-NEXT:    vpsubq %xmm14, %xmm9, %xmm9
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm14 = xmm12[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm14, %xmm14
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm15 = xmm13[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm15, %xmm15
-; AVX1-NEXT:    vpsubq %xmm15, %xmm14, %xmm15
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm12 = xmm12[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm12, %xmm12
-; AVX1-NEXT:    vpmovsxbq %xmm8, %xmm14
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm13[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm8, %xmm8
-; AVX1-NEXT:    vpsubq %xmm8, %xmm12, %xmm8
-; AVX1-NEXT:    vpmovsxbw %xmm3, %xmm12
-; AVX1-NEXT:    vpmovsxbq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm14, %xmm14
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm13, %xmm13
-; AVX1-NEXT:    vpsubq %xmm13, %xmm1, %xmm13
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm12[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
-; AVX1-NEXT:    vpsubq %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovsxbq %xmm3, %xmm0
-; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm12
-; AVX1-NEXT:    vpsubq %xmm12, %xmm0, %xmm12
-; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm0
-; AVX1-NEXT:    vpmovsxbq %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxbq %xmm2, %xmm2
-; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm4, %ymm3
-; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubq %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm4, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm3, %ymm4, %ymm3, %ymm3
-; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm4
-; AVX1-NEXT:    vpsubq %xmm6, %xmm0, %xmm6
-; AVX1-NEXT:    vpsubq %xmm7, %xmm0, %xmm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX1-NEXT:    vblendvpd %ymm4, %ymm6, %ymm4, %ymm6
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm5, %ymm4
-; AVX1-NEXT:    vpsubq %xmm11, %xmm0, %xmm7
-; AVX1-NEXT:    vpsubq %xmm5, %xmm0, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm5, %ymm5
-; AVX1-NEXT:    vblendvpd %ymm4, %ymm5, %ymm4, %ymm5
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm11, %ymm4
-; AVX1-NEXT:    vpsubq %xmm10, %xmm0, %xmm7
-; AVX1-NEXT:    vpsubq %xmm11, %xmm0, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm10, %ymm7
-; AVX1-NEXT:    vblendvpd %ymm4, %ymm7, %ymm4, %ymm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm9, %ymm7
-; AVX1-NEXT:    vpsubq %xmm15, %xmm0, %xmm10
-; AVX1-NEXT:    vpsubq %xmm9, %xmm0, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm10, %ymm9, %ymm9
-; AVX1-NEXT:    vblendvpd %ymm7, %ymm9, %ymm7, %ymm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm14, %ymm9
-; AVX1-NEXT:    vpsubq %xmm14, %xmm0, %xmm10
-; AVX1-NEXT:    vpsubq %xmm8, %xmm0, %xmm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm10, %ymm8
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm8, %ymm9, %ymm8
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm13, %ymm9
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vpsubq %xmm13, %xmm0, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm10, %ymm1
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm1, %ymm9, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm12, %ymm9
-; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpsubq %xmm12, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm0, %ymm9, %ymm2
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm0 = [255,255,255,255]
-; AVX1-NEXT:    vandpd %ymm0, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT:    vpackusdw %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm0, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm9
-; AVX1-NEXT:    vpackusdw %xmm9, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vandpd %ymm0, %ymm8, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm8
-; AVX1-NEXT:    vpackusdw %xmm8, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm0, %ymm7, %ymm7
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
-; AVX1-NEXT:    vpackusdw %xmm8, %xmm7, %xmm7
-; AVX1-NEXT:    vpackusdw %xmm7, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vandpd %ymm0, %ymm4, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm0, %ymm5, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm0, %ymm6, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandpd %ymm0, %ymm3, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpackusdw %xmm0, %xmm4, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v32i8_undef:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm3, %ymm3
-; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm4
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm5, %ymm5
-; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm6
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm7, %ymm7
-; AVX2-NEXT:    vpmovsxbq %xmm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm8, %ymm8
-; AVX2-NEXT:    vpmovsxwq %xmm6, %ymm6
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm9
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm10, %ymm10
-; AVX2-NEXT:    vpsubq %ymm10, %ymm3, %ymm3
-; AVX2-NEXT:    vpmovsxbq %xmm1, %ymm10
-; AVX2-NEXT:    vpsubq %ymm10, %ymm4, %ymm4
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm10, %ymm10
-; AVX2-NEXT:    vpsubq %ymm10, %ymm5, %ymm5
-; AVX2-NEXT:    vpmovsxwq %xmm9, %ymm9
-; AVX2-NEXT:    vpsubq %ymm9, %ymm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm9
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm10, %ymm10
-; AVX2-NEXT:    vpsubq %ymm10, %ymm7, %ymm7
-; AVX2-NEXT:    vpmovsxbq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm9, %ymm9
-; AVX2-NEXT:    vpsubq %ymm9, %ymm8, %ymm8
-; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm6, %ymm1
-; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX2-NEXT:    vpsubq %ymm3, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm9, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm4, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm9, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm5, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm5, %ymm9, %ymm5, %ymm5
-; AVX2-NEXT:    vpsubq %ymm2, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm9, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm7, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm7, %ymm9, %ymm7, %ymm7
-; AVX2-NEXT:    vpsubq %ymm0, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm9, %ymm0, %ymm0
-; AVX2-NEXT:    vpsubq %ymm8, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm8, %ymm9, %ymm8, %ymm8
-; AVX2-NEXT:    vpsubq %ymm1, %ymm6, %ymm6
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [255,255,255,255]
-; AVX2-NEXT:    vandpd %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vandpd %ymm6, %ymm8, %ymm8
-; AVX2-NEXT:    vpackusdw %ymm8, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm0, %ymm0
-; AVX2-NEXT:    vandpd %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpackusdw %ymm7, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm2, %ymm1
-; AVX2-NEXT:    vandpd %ymm6, %ymm5, %ymm2
-; AVX2-NEXT:    vpackusdw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm4, %ymm2
-; AVX2-NEXT:    vandpd %ymm6, %ymm3, %ymm3
-; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v32i8_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm4
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm5
-; AVX512-NEXT:    vpmovsxwq %xmm5, %zmm6
-; AVX512-NEXT:    vpsubq %zmm6, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX512-NEXT:    vpmovsxwq %xmm5, %zmm5
-; AVX512-NEXT:    vpsubq %zmm5, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm5
-; AVX512-NEXT:    vpsubq %zmm5, %zmm4, %zmm4
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm3, %zmm1
-; AVX512-NEXT:    vpabsq %zmm2, %zmm2
-; AVX512-NEXT:    vpabsq %zmm4, %zmm3
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = sext <32 x i8> %a to <32 x i64>
   %bext = sext <32 x i8> %b to <32 x i64>
@@ -614,139 +80,29 @@ define <32 x i8> @abd_ext_v32i8_undef(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <16 x i16> @abd_ext_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm6, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm7, %xmm7
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm8
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm5, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm6, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm7, %xmm7
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm8, %xmm8
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm11
-; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
-; AVX1-NEXT:    vpsubq %xmm3, %xmm12, %xmm3
-; AVX1-NEXT:    vpsubq %xmm2, %xmm12, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm11, %ymm2, %ymm11, %ymm2
-; AVX1-NEXT:    vpsubq %xmm5, %xmm12, %xmm3
-; AVX1-NEXT:    vpsubq %xmm4, %xmm12, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm10, %ymm3, %ymm10, %ymm3
-; AVX1-NEXT:    vpsubq %xmm7, %xmm12, %xmm4
-; AVX1-NEXT:    vpsubq %xmm6, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm4, %ymm9, %ymm4
-; AVX1-NEXT:    vpsubq %xmm0, %xmm12, %xmm0
-; AVX1-NEXT:    vpsubq %xmm8, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm4, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
-; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm3
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm4, %ymm4
-; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm5, %ymm5
-; AVX2-NEXT:    vpsubq %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm5
-; AVX2-NEXT:    vpsubq %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm5, %ymm5
-; AVX2-NEXT:    vpsubq %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm4, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v16i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm2
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512-NEXT:    vpmovsxwq %xmm3, %zmm3
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = sext <16 x i16> %a to <16 x i64>
   %bext = sext <16 x i16> %b to <16 x i64>
@@ -759,139 +115,29 @@ define <16 x i16> @abd_ext_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 define <16 x i16> @abd_ext_v16i16_undef(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v16i16_undef:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm5, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm6, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm7, %xmm7
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm8
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm5, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm6, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovsxwq %xmm9, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm7, %xmm7
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm9
-; AVX1-NEXT:    vpsubq %xmm9, %xmm8, %xmm8
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm11
-; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
-; AVX1-NEXT:    vpsubq %xmm3, %xmm12, %xmm3
-; AVX1-NEXT:    vpsubq %xmm2, %xmm12, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm11, %ymm2, %ymm11, %ymm2
-; AVX1-NEXT:    vpsubq %xmm5, %xmm12, %xmm3
-; AVX1-NEXT:    vpsubq %xmm4, %xmm12, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm10, %ymm3, %ymm10, %ymm3
-; AVX1-NEXT:    vpsubq %xmm7, %xmm12, %xmm4
-; AVX1-NEXT:    vpsubq %xmm6, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm4, %ymm9, %ymm4
-; AVX1-NEXT:    vpsubq %xmm0, %xmm12, %xmm0
-; AVX1-NEXT:    vpsubq %xmm8, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm4, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v16i16_undef:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm2, %ymm2
-; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm3
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm4, %ymm4
-; AVX2-NEXT:    vpmovsxwq %xmm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm5, %ymm5
-; AVX2-NEXT:    vpsubq %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm5
-; AVX2-NEXT:    vpsubq %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovsxwq %xmm5, %ymm5
-; AVX2-NEXT:    vpsubq %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpmovsxwq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm4, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v16i16_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm2
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512-NEXT:    vpmovsxwq %xmm3, %zmm3
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = sext <16 x i16> %a to <16 x i64>
   %bext = sext <16 x i16> %b to <16 x i64>
@@ -904,67 +150,29 @@ define <16 x i16> @abd_ext_v16i16_undef(<16 x i16> %a, <16 x i16> %b) nounwind {
 define <8 x i32> @abd_ext_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
-; AVX1-NEXT:    vpsubq %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
-; AVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm5
-; AVX1-NEXT:    vpsubq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm5
-; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpsubq %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpsubq %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm5, %ymm2
-; AVX1-NEXT:    vpsubq %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpsubq %xmm4, %xmm6, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
-; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpmovsxdq %xmm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v8i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = sext <8 x i32> %a to <8 x i64>
   %bext = sext <8 x i32> %b to <8 x i64>
@@ -977,67 +185,29 @@ define <8 x i32> @abd_ext_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 define <8 x i32> @abd_ext_v8i32_undef(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v8i32_undef:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm6
-; AVX1-NEXT:    vpsubq %xmm6, %xmm3, %xmm3
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm5, %xmm5
-; AVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm5
-; AVX1-NEXT:    vpsubq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm5
-; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpsubq %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpsubq %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm5, %ymm2
-; AVX1-NEXT:    vpsubq %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpsubq %xmm4, %xmm6, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v8i32_undef:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
-; AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpmovsxdq %xmm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v8i32_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = sext <8 x i32> %a to <8 x i64>
   %bext = sext <8 x i32> %b to <8 x i64>
@@ -1188,71 +358,9 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ;
 ; AVX512-LABEL: abd_ext_v4i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    vmovq %xmm0, %r11
-; AVX512-NEXT:    movq %r11, %r10
-; AVX512-NEXT:    sarq $63, %r10
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT:    movq %rcx, %r9
-; AVX512-NEXT:    sarq $63, %r9
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rdx
-; AVX512-NEXT:    movq %rdx, %r8
-; AVX512-NEXT:    sarq $63, %r8
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512-NEXT:    movq %rsi, %rdi
-; AVX512-NEXT:    sarq $63, %rdi
-; AVX512-NEXT:    vmovq %xmm1, %rbx
-; AVX512-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rbx
-; AVX512-NEXT:    vpextrq $1, %xmm1, %r14
-; AVX512-NEXT:    movq %r14, %r15
-; AVX512-NEXT:    sarq $63, %r15
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %r12
-; AVX512-NEXT:    movq %r12, %r13
-; AVX512-NEXT:    sarq $63, %r13
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512-NEXT:    movq %rax, %rbp
-; AVX512-NEXT:    sarq $63, %rbp
-; AVX512-NEXT:    subq %rax, %rsi
-; AVX512-NEXT:    sbbq %rbp, %rdi
-; AVX512-NEXT:    subq %r12, %rdx
-; AVX512-NEXT:    sbbq %r13, %r8
-; AVX512-NEXT:    subq %r14, %rcx
-; AVX512-NEXT:    sbbq %r15, %r9
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq %rbx, %r10
-; AVX512-NEXT:    sarq $63, %r10
-; AVX512-NEXT:    xorq %r10, %r11
-; AVX512-NEXT:    subq %r10, %r11
-; AVX512-NEXT:    sarq $63, %r9
-; AVX512-NEXT:    xorq %r9, %rcx
-; AVX512-NEXT:    subq %r9, %rcx
-; AVX512-NEXT:    sarq $63, %r8
-; AVX512-NEXT:    xorq %r8, %rdx
-; AVX512-NEXT:    subq %r8, %rdx
-; AVX512-NEXT:    sarq $63, %rdi
-; AVX512-NEXT:    xorq %rdi, %rsi
-; AVX512-NEXT:    subq %rdi, %rsi
-; AVX512-NEXT:    vmovq %rsi, %xmm0
-; AVX512-NEXT:    vmovq %rdx, %xmm1
-; AVX512-NEXT:    vmovq %rcx, %xmm2
-; AVX512-NEXT:    vmovq %r11, %xmm3
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = sext <4 x i64> %a to <4 x i128>
   %bext = sext <4 x i64> %b to <4 x i128>
@@ -1403,71 +511,9 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
 ;
 ; AVX512-LABEL: abd_ext_v4i64_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    vmovq %xmm0, %r11
-; AVX512-NEXT:    movq %r11, %r10
-; AVX512-NEXT:    sarq $63, %r10
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT:    movq %rcx, %r9
-; AVX512-NEXT:    sarq $63, %r9
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rdx
-; AVX512-NEXT:    movq %rdx, %r8
-; AVX512-NEXT:    sarq $63, %r8
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512-NEXT:    movq %rsi, %rdi
-; AVX512-NEXT:    sarq $63, %rdi
-; AVX512-NEXT:    vmovq %xmm1, %rbx
-; AVX512-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rbx
-; AVX512-NEXT:    vpextrq $1, %xmm1, %r14
-; AVX512-NEXT:    movq %r14, %r15
-; AVX512-NEXT:    sarq $63, %r15
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %r12
-; AVX512-NEXT:    movq %r12, %r13
-; AVX512-NEXT:    sarq $63, %r13
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512-NEXT:    movq %rax, %rbp
-; AVX512-NEXT:    sarq $63, %rbp
-; AVX512-NEXT:    subq %rax, %rsi
-; AVX512-NEXT:    sbbq %rbp, %rdi
-; AVX512-NEXT:    subq %r12, %rdx
-; AVX512-NEXT:    sbbq %r13, %r8
-; AVX512-NEXT:    subq %r14, %rcx
-; AVX512-NEXT:    sbbq %r15, %r9
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq %rbx, %r10
-; AVX512-NEXT:    sarq $63, %r10
-; AVX512-NEXT:    xorq %r10, %r11
-; AVX512-NEXT:    subq %r10, %r11
-; AVX512-NEXT:    sarq $63, %r9
-; AVX512-NEXT:    xorq %r9, %rcx
-; AVX512-NEXT:    subq %r9, %rcx
-; AVX512-NEXT:    sarq $63, %r8
-; AVX512-NEXT:    xorq %r8, %rdx
-; AVX512-NEXT:    subq %r8, %rdx
-; AVX512-NEXT:    sarq $63, %rdi
-; AVX512-NEXT:    xorq %rdi, %rsi
-; AVX512-NEXT:    subq %rdi, %rsi
-; AVX512-NEXT:    vmovq %rsi, %xmm0
-; AVX512-NEXT:    vmovq %rdx, %xmm1
-; AVX512-NEXT:    vmovq %rcx, %xmm2
-; AVX512-NEXT:    vmovq %r11, %xmm3
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = sext <4 x i64> %a to <4 x i128>
   %bext = sext <4 x i64> %b to <4 x i128>
@@ -1484,15 +530,15 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
 define <32 x i8> @abd_minmax_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX1-LABEL: abd_minmax_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpminsb %xmm3, %xmm4, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm3
 ; AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmaxsb %xmm3, %xmm4, %xmm1
-; AVX1-NEXT:    vpsubb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_minmax_v32i8:
@@ -1517,15 +563,15 @@ define <32 x i8> @abd_minmax_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <16 x i16> @abd_minmax_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX1-LABEL: abd_minmax_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpminsw %xmm3, %xmm4, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm3
 ; AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmaxsw %xmm3, %xmm4, %xmm1
-; AVX1-NEXT:    vpsubw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_minmax_v16i16:
@@ -1550,15 +596,15 @@ define <16 x i16> @abd_minmax_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 define <8 x i32> @abd_minmax_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX1-LABEL: abd_minmax_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpminsd %xmm3, %xmm4, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
 ; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmaxsd %xmm3, %xmm4, %xmm1
-; AVX1-NEXT:    vpsubd %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_minmax_v8i32:

diff  --git a/llvm/test/CodeGen/X86/abds-vector-512.ll b/llvm/test/CodeGen/X86/abds-vector-512.ll
index 0f6d37bcdda42..d19ff6edd78ba 100644
--- a/llvm/test/CodeGen/X86/abds-vector-512.ll
+++ b/llvm/test/CodeGen/X86/abds-vector-512.ll
@@ -7,78 +7,25 @@
 ;
 
 define <64 x i8> @abd_ext_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
-; AVX512-LABEL: abd_ext_v64i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512-NEXT:    vpmovsxbw %xmm4, %ymm4
-; AVX512-NEXT:    vpmovsxwq %xmm4, %zmm5
-; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512-NEXT:    vpmovsxwq %xmm4, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm6
-; AVX512-NEXT:    vpmovsxwq %xmm6, %zmm7
-; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512-NEXT:    vpmovsxwq %xmm6, %zmm6
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm8
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm10
-; AVX512-NEXT:    vpsubq %zmm10, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm9
-; AVX512-NEXT:    vpsubq %zmm9, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm9
-; AVX512-NEXT:    vpmovsxbw %xmm9, %ymm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm10
-; AVX512-NEXT:    vpsubq %zmm10, %zmm5, %zmm5
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm9
-; AVX512-NEXT:    vpsubq %zmm9, %zmm4, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm10
-; AVX512-NEXT:    vpsubq %zmm10, %zmm7, %zmm7
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm9
-; AVX512-NEXT:    vpsubq %zmm9, %zmm6, %zmm6
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm9
-; AVX512-NEXT:    vpsubq %zmm9, %zmm8, %zmm8
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm3, %zmm1
-; AVX512-NEXT:    vpabsq %zmm2, %zmm2
-; AVX512-NEXT:    vpabsq %zmm5, %zmm3
-; AVX512-NEXT:    vpabsq %zmm4, %zmm4
-; AVX512-NEXT:    vpabsq %zmm7, %zmm5
-; AVX512-NEXT:    vpabsq %zmm6, %zmm6
-; AVX512-NEXT:    vpabsq %zmm8, %zmm7
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0]
-; AVX512-NEXT:    vpmovqb %zmm6, %xmm6
-; AVX512-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm5, %ymm0
-; AVX512-NEXT:    vpmovqb %zmm4, %xmm4
-; AVX512-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512BW-LABEL: abd_ext_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: abd_ext_v64i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminsb %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminsb %ymm1, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %aext = sext <64 x i8> %a to <64 x i64>
   %bext = sext <64 x i8> %b to <64 x i64>
   %sub = sub <64 x i64> %aext, %bext
@@ -88,78 +35,25 @@ define <64 x i8> @abd_ext_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 }
 
 define <64 x i8> @abd_ext_v64i8_undef(<64 x i8> %a, <64 x i8> %b) nounwind {
-; AVX512-LABEL: abd_ext_v64i8_undef:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512-NEXT:    vpmovsxbw %xmm4, %ymm4
-; AVX512-NEXT:    vpmovsxwq %xmm4, %zmm5
-; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512-NEXT:    vpmovsxwq %xmm4, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm6
-; AVX512-NEXT:    vpmovsxwq %xmm6, %zmm7
-; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512-NEXT:    vpmovsxwq %xmm6, %zmm6
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovsxbw %xmm0, %ymm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm8
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm10
-; AVX512-NEXT:    vpsubq %zmm10, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm9
-; AVX512-NEXT:    vpsubq %zmm9, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm9
-; AVX512-NEXT:    vpmovsxbw %xmm9, %ymm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm10
-; AVX512-NEXT:    vpsubq %zmm10, %zmm5, %zmm5
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm9
-; AVX512-NEXT:    vpsubq %zmm9, %zmm4, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm10
-; AVX512-NEXT:    vpsubq %zmm10, %zmm7, %zmm7
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovsxwq %xmm9, %zmm9
-; AVX512-NEXT:    vpsubq %zmm9, %zmm6, %zmm6
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovsxbw %xmm1, %ymm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm9
-; AVX512-NEXT:    vpsubq %zmm9, %zmm8, %zmm8
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm3, %zmm1
-; AVX512-NEXT:    vpabsq %zmm2, %zmm2
-; AVX512-NEXT:    vpabsq %zmm5, %zmm3
-; AVX512-NEXT:    vpabsq %zmm4, %zmm4
-; AVX512-NEXT:    vpabsq %zmm7, %zmm5
-; AVX512-NEXT:    vpabsq %zmm6, %zmm6
-; AVX512-NEXT:    vpabsq %zmm8, %zmm7
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0]
-; AVX512-NEXT:    vpmovqb %zmm6, %xmm6
-; AVX512-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm5, %ymm0
-; AVX512-NEXT:    vpmovqb %zmm4, %xmm4
-; AVX512-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512BW-LABEL: abd_ext_v64i8_undef:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: abd_ext_v64i8_undef:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminsb %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminsb %ymm1, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %aext = sext <64 x i8> %a to <64 x i64>
   %bext = sext <64 x i8> %b to <64 x i64>
   %sub = sub <64 x i64> %aext, %bext
@@ -169,38 +63,25 @@ define <64 x i8> @abd_ext_v64i8_undef(<64 x i8> %a, <64 x i8> %b) nounwind {
 }
 
 define <32 x i16> @abd_ext_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
-; AVX512-LABEL: abd_ext_v32i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm2
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512-NEXT:    vpmovsxwq %xmm4, %zmm4
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512-NEXT:    vpmovsxwq %xmm5, %zmm5
-; AVX512-NEXT:    vpsubq %zmm5, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm5
-; AVX512-NEXT:    vpsubq %zmm5, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512-NEXT:    vpmovsxwq %xmm5, %zmm5
-; AVX512-NEXT:    vpsubq %zmm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm3, %zmm2
-; AVX512-NEXT:    vpabsq %zmm4, %zmm3
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512-NEXT:    vpmovqw %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512BW-LABEL: abd_ext_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: abd_ext_v32i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminsw %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminsw %ymm1, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %aext = sext <32 x i16> %a to <32 x i64>
   %bext = sext <32 x i16> %b to <32 x i64>
   %sub = sub <32 x i64> %aext, %bext
@@ -210,38 +91,25 @@ define <32 x i16> @abd_ext_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 }
 
 define <32 x i16> @abd_ext_v32i16_undef(<32 x i16> %a, <32 x i16> %b) nounwind {
-; AVX512-LABEL: abd_ext_v32i16_undef:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vpmovsxwq %xmm2, %zmm2
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512-NEXT:    vpmovsxwq %xmm4, %zmm4
-; AVX512-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512-NEXT:    vpmovsxwq %xmm5, %zmm5
-; AVX512-NEXT:    vpsubq %zmm5, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm5
-; AVX512-NEXT:    vpsubq %zmm5, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512-NEXT:    vpmovsxwq %xmm5, %zmm5
-; AVX512-NEXT:    vpsubq %zmm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpmovsxwq %xmm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm3, %zmm2
-; AVX512-NEXT:    vpabsq %zmm4, %zmm3
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512-NEXT:    vpmovqw %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512BW-LABEL: abd_ext_v32i16_undef:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: abd_ext_v32i16_undef:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminsw %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminsw %ymm1, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %aext = sext <32 x i16> %a to <32 x i64>
   %bext = sext <32 x i16> %b to <32 x i64>
   %sub = sub <32 x i64> %aext, %bext
@@ -253,19 +121,9 @@ define <32 x i16> @abd_ext_v32i16_undef(<32 x i16> %a, <32 x i16> %b) nounwind {
 define <16 x i32> @abd_ext_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
 ; AVX512-LABEL: abd_ext_v16i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512-NEXT:    vpmovsxdq %ymm2, %zmm2
-; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512-NEXT:    vpmovsxdq %ymm3, %zmm3
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %aext = sext <16 x i32> %a to <16 x i64>
   %bext = sext <16 x i32> %b to <16 x i64>
@@ -278,19 +136,9 @@ define <16 x i32> @abd_ext_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
 define <16 x i32> @abd_ext_v16i32_undef(<16 x i32> %a, <16 x i32> %b) nounwind {
 ; AVX512-LABEL: abd_ext_v16i32_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512-NEXT:    vpmovsxdq %ymm2, %zmm2
-; AVX512-NEXT:    vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512-NEXT:    vpmovsxdq %ymm3, %zmm3
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovsxdq %ymm1, %zmm1
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %aext = sext <16 x i32> %a to <16 x i64>
   %bext = sext <16 x i32> %b to <16 x i64>
@@ -303,142 +151,9 @@ define <16 x i32> @abd_ext_v16i32_undef(<16 x i32> %a, <16 x i32> %b) nounwind {
 define <8 x i64> @abd_ext_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; AVX512-LABEL: abd_ext_v8i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    pushq %rax
-; AVX512-NEXT:    vmovq %xmm0, %r9
-; AVX512-NEXT:    movq %r9, (%rsp) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r9
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rbp
-; AVX512-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rbp
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, %r13
-; AVX512-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r13
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r12
-; AVX512-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r12
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vmovq %xmm0, %r15
-; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r15
-; AVX512-NEXT:    vpextrq $1, %xmm0, %r14
-; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r14
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rbx
-; AVX512-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rbx
-; AVX512-NEXT:    vpextrq $1, %xmm0, %r11
-; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r11
-; AVX512-NEXT:    vmovq %xmm1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512-NEXT:    vmovq %xmm0, %rdi
-; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rdi
-; AVX512-NEXT:    vpextrq $1, %xmm0, %r10
-; AVX512-NEXT:    movq %r10, %r8
-; AVX512-NEXT:    sarq $63, %r8
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rsi
-; AVX512-NEXT:    movq %rsi, %rax
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512-NEXT:    movq %rdx, %rcx
-; AVX512-NEXT:    sarq $63, %rcx
-; AVX512-NEXT:    subq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    sbbq %rcx, %r11
-; AVX512-NEXT:    subq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    sbbq %rax, %rbx
-; AVX512-NEXT:    subq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    sbbq %r8, %r14
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq %rdi, %r15
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; AVX512-NEXT:    movq (%rsp), %rax # 8-byte Reload
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512-NEXT:    sarq $63, %r9
-; AVX512-NEXT:    xorq %r9, %rax
-; AVX512-NEXT:    subq %r9, %rax
-; AVX512-NEXT:    sarq $63, %rbp
-; AVX512-NEXT:    xorq %rbp, %rcx
-; AVX512-NEXT:    subq %rbp, %rcx
-; AVX512-NEXT:    sarq $63, %r13
-; AVX512-NEXT:    xorq %r13, %rdx
-; AVX512-NEXT:    subq %r13, %rdx
-; AVX512-NEXT:    sarq $63, %r12
-; AVX512-NEXT:    xorq %r12, %rsi
-; AVX512-NEXT:    subq %r12, %rsi
-; AVX512-NEXT:    sarq $63, %r15
-; AVX512-NEXT:    movq %r8, %rdi
-; AVX512-NEXT:    xorq %r15, %rdi
-; AVX512-NEXT:    subq %r15, %rdi
-; AVX512-NEXT:    sarq $63, %r14
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT:    xorq %r14, %r8
-; AVX512-NEXT:    subq %r14, %r8
-; AVX512-NEXT:    sarq $63, %rbx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT:    xorq %rbx, %r9
-; AVX512-NEXT:    subq %rbx, %r9
-; AVX512-NEXT:    sarq $63, %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT:    xorq %r11, %r10
-; AVX512-NEXT:    subq %r11, %r10
-; AVX512-NEXT:    vmovq %r10, %xmm0
-; AVX512-NEXT:    vmovq %r9, %xmm1
-; AVX512-NEXT:    vmovq %r8, %xmm2
-; AVX512-NEXT:    vmovq %rdi, %xmm3
-; AVX512-NEXT:    vmovq %rsi, %xmm4
-; AVX512-NEXT:    vmovq %rdx, %xmm5
-; AVX512-NEXT:    vmovq %rcx, %xmm6
-; AVX512-NEXT:    vmovq %rax, %xmm7
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    addq $8, %rsp
-; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %aext = sext <8 x i64> %a to <8 x i128>
   %bext = sext <8 x i64> %b to <8 x i128>
@@ -451,142 +166,9 @@ define <8 x i64> @abd_ext_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 define <8 x i64> @abd_ext_v8i64_undef(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; AVX512-LABEL: abd_ext_v8i64_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    pushq %rax
-; AVX512-NEXT:    vmovq %xmm0, %r9
-; AVX512-NEXT:    movq %r9, (%rsp) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r9
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rbp
-; AVX512-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rbp
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, %r13
-; AVX512-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r13
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r12
-; AVX512-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r12
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vmovq %xmm0, %r15
-; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r15
-; AVX512-NEXT:    vpextrq $1, %xmm0, %r14
-; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r14
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rbx
-; AVX512-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rbx
-; AVX512-NEXT:    vpextrq $1, %xmm0, %r11
-; AVX512-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %r11
-; AVX512-NEXT:    vmovq %xmm1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
-; AVX512-NEXT:    vmovq %xmm0, %rdi
-; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    sarq $63, %rdi
-; AVX512-NEXT:    vpextrq $1, %xmm0, %r10
-; AVX512-NEXT:    movq %r10, %r8
-; AVX512-NEXT:    sarq $63, %r8
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rsi
-; AVX512-NEXT:    movq %rsi, %rax
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512-NEXT:    movq %rdx, %rcx
-; AVX512-NEXT:    sarq $63, %rcx
-; AVX512-NEXT:    subq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    sbbq %rcx, %r11
-; AVX512-NEXT:    subq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    sbbq %rax, %rbx
-; AVX512-NEXT:    subq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT:    sbbq %r8, %r14
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq %rdi, %r15
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; AVX512-NEXT:    movq (%rsp), %rax # 8-byte Reload
-; AVX512-NEXT:    subq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX512-NEXT:    sbbq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512-NEXT:    sarq $63, %r9
-; AVX512-NEXT:    xorq %r9, %rax
-; AVX512-NEXT:    subq %r9, %rax
-; AVX512-NEXT:    sarq $63, %rbp
-; AVX512-NEXT:    xorq %rbp, %rcx
-; AVX512-NEXT:    subq %rbp, %rcx
-; AVX512-NEXT:    sarq $63, %r13
-; AVX512-NEXT:    xorq %r13, %rdx
-; AVX512-NEXT:    subq %r13, %rdx
-; AVX512-NEXT:    sarq $63, %r12
-; AVX512-NEXT:    xorq %r12, %rsi
-; AVX512-NEXT:    subq %r12, %rsi
-; AVX512-NEXT:    sarq $63, %r15
-; AVX512-NEXT:    movq %r8, %rdi
-; AVX512-NEXT:    xorq %r15, %rdi
-; AVX512-NEXT:    subq %r15, %rdi
-; AVX512-NEXT:    sarq $63, %r14
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT:    xorq %r14, %r8
-; AVX512-NEXT:    subq %r14, %r8
-; AVX512-NEXT:    sarq $63, %rbx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT:    xorq %rbx, %r9
-; AVX512-NEXT:    subq %rbx, %r9
-; AVX512-NEXT:    sarq $63, %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT:    xorq %r11, %r10
-; AVX512-NEXT:    subq %r11, %r10
-; AVX512-NEXT:    vmovq %r10, %xmm0
-; AVX512-NEXT:    vmovq %r9, %xmm1
-; AVX512-NEXT:    vmovq %r8, %xmm2
-; AVX512-NEXT:    vmovq %rdi, %xmm3
-; AVX512-NEXT:    vmovq %rsi, %xmm4
-; AVX512-NEXT:    vmovq %rdx, %xmm5
-; AVX512-NEXT:    vmovq %rcx, %xmm6
-; AVX512-NEXT:    vmovq %rax, %xmm7
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    addq $8, %rsp
-; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %aext = sext <8 x i64> %a to <8 x i128>
   %bext = sext <8 x i64> %b to <8 x i128>
@@ -610,15 +192,15 @@ define <64 x i8> @abd_minmax_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512DQ-LABEL: abd_minmax_v64i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpminsb %ymm1, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512DQ-NEXT:    vpminsb %ymm3, %ymm4, %ymm5
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminsb %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminsb %ymm1, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmaxsb %ymm3, %ymm4, %ymm1
-; AVX512DQ-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    retq
   %min = call <64 x i8> @llvm.smin.v64i8(<64 x i8> %a, <64 x i8> %b)
   %max = call <64 x i8> @llvm.smax.v64i8(<64 x i8> %a, <64 x i8> %b)
@@ -636,15 +218,15 @@ define <32 x i16> @abd_minmax_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 ;
 ; AVX512DQ-LABEL: abd_minmax_v32i16:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpminsw %ymm1, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512DQ-NEXT:    vpminsw %ymm3, %ymm4, %ymm5
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminsw %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminsw %ymm1, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmaxsw %ymm3, %ymm4, %ymm1
-; AVX512DQ-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    retq
   %min = call <32 x i16> @llvm.smin.v32i16(<32 x i16> %a, <32 x i16> %b)
   %max = call <32 x i16> @llvm.smax.v32i16(<32 x i16> %a, <32 x i16> %b)

diff  --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll
index efcdb0cec4c72..c445b20cf905d 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-128.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -10,363 +10,20 @@
 ;
 
 define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
-; SSE2-LABEL: abd_ext_v16i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm9, %xmm9
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
-; SSE2-NEXT:    movdqa %xmm2, %xmm7
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; SSE2-NEXT:    movdqa %xmm7, %xmm6
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm8
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; SSE2-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm10, %xmm11
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE2-NEXT:    movdqa %xmm11, %xmm12
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
-; SSE2-NEXT:    psubq %xmm12, %xmm0
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE2-NEXT:    psubq %xmm11, %xmm3
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm10, %xmm11
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
-; SSE2-NEXT:    psubq %xmm11, %xmm5
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; SSE2-NEXT:    psubq %xmm10, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
-; SSE2-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; SSE2-NEXT:    movdqa %xmm10, %xmm11
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
-; SSE2-NEXT:    psubq %xmm11, %xmm6
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; SSE2-NEXT:    psubq %xmm10, %xmm7
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; SSE2-NEXT:    psubq %xmm10, %xmm8
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
-; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NEXT:    psubq %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm5, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm5
-; SSE2-NEXT:    psubq %xmm1, %xmm5
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    psubq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm6, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm6
-; SSE2-NEXT:    psubq %xmm1, %xmm6
-; SSE2-NEXT:    movdqa %xmm7, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm7
-; SSE2-NEXT:    psubq %xmm1, %xmm7
-; SSE2-NEXT:    movdqa %xmm8, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm8
-; SSE2-NEXT:    psubq %xmm1, %xmm8
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm1, %xmm8
-; SSE2-NEXT:    packuswb %xmm2, %xmm8
-; SSE2-NEXT:    pand %xmm1, %xmm7
-; SSE2-NEXT:    pand %xmm1, %xmm6
-; SSE2-NEXT:    packuswb %xmm7, %xmm6
-; SSE2-NEXT:    packuswb %xmm8, %xmm6
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm5
-; SSE2-NEXT:    packuswb %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    packuswb %xmm3, %xmm0
-; SSE2-NEXT:    packuswb %xmm5, %xmm0
-; SSE2-NEXT:    packuswb %xmm6, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: abd_ext_v16i8:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    psrld $16, %xmm2
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    movdqa %xmm0, %xmm4
-; SSE42-NEXT:    psrlq $48, %xmm4
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    movdqa %xmm0, %xmm6
-; SSE42-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrld $16, %xmm0
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm2
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm9
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrlq $48, %xmm0
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm4
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm6
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm7
-; SSE42-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm8
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm2, %xmm10
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm9, %xmm10
-; SSE42-NEXT:    movdqa %xmm9, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm9
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm3, %xmm10
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm3
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm5, %xmm10
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm5
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm4, %xmm10
-; SSE42-NEXT:    movdqa %xmm4, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm4
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm6, %xmm10
-; SSE42-NEXT:    movdqa %xmm6, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm6
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm7, %xmm10
-; SSE42-NEXT:    movdqa %xmm7, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm7
-; SSE42-NEXT:    psubq %xmm8, %xmm1
-; SSE42-NEXT:    movdqa %xmm8, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm8
-; SSE42-NEXT:    movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE42-NEXT:    andpd %xmm0, %xmm8
-; SSE42-NEXT:    andpd %xmm0, %xmm7
-; SSE42-NEXT:    packusdw %xmm8, %xmm7
-; SSE42-NEXT:    andpd %xmm0, %xmm6
-; SSE42-NEXT:    andpd %xmm0, %xmm4
-; SSE42-NEXT:    packusdw %xmm6, %xmm4
-; SSE42-NEXT:    packusdw %xmm7, %xmm4
-; SSE42-NEXT:    andpd %xmm0, %xmm5
-; SSE42-NEXT:    andpd %xmm0, %xmm3
-; SSE42-NEXT:    packusdw %xmm5, %xmm3
-; SSE42-NEXT:    andpd %xmm0, %xmm9
-; SSE42-NEXT:    andpd %xmm0, %xmm2
-; SSE42-NEXT:    packusdw %xmm2, %xmm9
-; SSE42-NEXT:    packusdw %xmm3, %xmm9
-; SSE42-NEXT:    packuswb %xmm4, %xmm9
-; SSE42-NEXT:    movdqa %xmm9, %xmm0
-; SSE42-NEXT:    retq
-;
-; AVX1-LABEL: abd_ext_v16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm11, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm11, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm11, %xmm6, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm10, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm11, %xmm8, %xmm8
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm10, %xmm7, %xmm7
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm10, %xmm9, %xmm9
-; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm9, %ymm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm12
-; AVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm5
-; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm10, %ymm4, %ymm10, %ymm4
-; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vpsubq %xmm7, %xmm2, %xmm3
-; AVX1-NEXT:    vpsubq %xmm8, %xmm2, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm12, %ymm3, %ymm12, %ymm3
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm11, %ymm0, %ymm11, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX1-NEXT:    vandpd %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vandpd %ymm2, %ymm4, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vandpd %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm2, %ymm3, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm6, %ymm3, %ymm3
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm0, %ymm0
-; AVX2-NEXT:    vpsubq %ymm4, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX2-NEXT:    vandpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vandpd %ymm2, %ymm4, %ymm4
-; AVX2-NEXT:    vpackusdw %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vandpd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; SSE-LABEL: abd_ext_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pminub %xmm1, %xmm2
+; SSE-NEXT:    pmaxub %xmm1, %xmm0
+; SSE-NEXT:    psubb %xmm2, %xmm0
+; SSE-NEXT:    retq
 ;
-; AVX512-LABEL: abd_ext_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = zext <16 x i8> %a to <16 x i64>
   %bext = zext <16 x i8> %b to <16 x i64>
   %sub = sub <16 x i64> %aext, %bext
@@ -376,363 +33,20 @@ define <16 x i8> @abd_ext_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 }
 
 define <16 x i8> @abd_ext_v16i8_undef(<16 x i8> %a, <16 x i8> %b) nounwind {
-; SSE2-LABEL: abd_ext_v16i8_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm9, %xmm9
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm9[2],xmm3[3],xmm9[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm9[2],xmm4[3],xmm9[3]
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
-; SSE2-NEXT:    movdqa %xmm2, %xmm7
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; SSE2-NEXT:    movdqa %xmm7, %xmm6
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm8
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; SSE2-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm10, %xmm11
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE2-NEXT:    movdqa %xmm11, %xmm12
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
-; SSE2-NEXT:    psubq %xmm12, %xmm0
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE2-NEXT:    psubq %xmm11, %xmm3
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm10, %xmm11
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
-; SSE2-NEXT:    psubq %xmm11, %xmm5
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; SSE2-NEXT:    psubq %xmm10, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
-; SSE2-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; SSE2-NEXT:    movdqa %xmm10, %xmm11
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
-; SSE2-NEXT:    psubq %xmm11, %xmm6
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
-; SSE2-NEXT:    psubq %xmm10, %xmm7
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm10
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
-; SSE2-NEXT:    psubq %xmm10, %xmm8
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
-; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm0
-; SSE2-NEXT:    psubq %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm3, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm3
-; SSE2-NEXT:    psubq %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm5, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm5
-; SSE2-NEXT:    psubq %xmm1, %xmm5
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm4
-; SSE2-NEXT:    psubq %xmm1, %xmm4
-; SSE2-NEXT:    movdqa %xmm6, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm6
-; SSE2-NEXT:    psubq %xmm1, %xmm6
-; SSE2-NEXT:    movdqa %xmm7, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm7
-; SSE2-NEXT:    psubq %xmm1, %xmm7
-; SSE2-NEXT:    movdqa %xmm8, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm8
-; SSE2-NEXT:    psubq %xmm1, %xmm8
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    psrad $31, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    psubq %xmm1, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm1, %xmm8
-; SSE2-NEXT:    packuswb %xmm2, %xmm8
-; SSE2-NEXT:    pand %xmm1, %xmm7
-; SSE2-NEXT:    pand %xmm1, %xmm6
-; SSE2-NEXT:    packuswb %xmm7, %xmm6
-; SSE2-NEXT:    packuswb %xmm8, %xmm6
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm5
-; SSE2-NEXT:    packuswb %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    packuswb %xmm3, %xmm0
-; SSE2-NEXT:    packuswb %xmm5, %xmm0
-; SSE2-NEXT:    packuswb %xmm6, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: abd_ext_v16i8_undef:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    psrld $16, %xmm2
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    movdqa %xmm0, %xmm4
-; SSE42-NEXT:    psrlq $48, %xmm4
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    movdqa %xmm0, %xmm6
-; SSE42-NEXT:    psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,zero,zero,zero,zero,xmm6[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,zero,zero,zero,zero,xmm7[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrld $16, %xmm0
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm2
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm9
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrlq $48, %xmm0
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm4
-; SSE42-NEXT:    movdqa %xmm1, %xmm0
-; SSE42-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm6
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm7
-; SSE42-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm8
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm2, %xmm10
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm2
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm9, %xmm10
-; SSE42-NEXT:    movdqa %xmm9, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm9
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm3, %xmm10
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm3
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm5, %xmm10
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm5
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm4, %xmm10
-; SSE42-NEXT:    movdqa %xmm4, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm4
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm6, %xmm10
-; SSE42-NEXT:    movdqa %xmm6, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm6
-; SSE42-NEXT:    pxor %xmm10, %xmm10
-; SSE42-NEXT:    psubq %xmm7, %xmm10
-; SSE42-NEXT:    movdqa %xmm7, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm10, %xmm7
-; SSE42-NEXT:    psubq %xmm8, %xmm1
-; SSE42-NEXT:    movdqa %xmm8, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm8
-; SSE42-NEXT:    movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE42-NEXT:    andpd %xmm0, %xmm8
-; SSE42-NEXT:    andpd %xmm0, %xmm7
-; SSE42-NEXT:    packusdw %xmm8, %xmm7
-; SSE42-NEXT:    andpd %xmm0, %xmm6
-; SSE42-NEXT:    andpd %xmm0, %xmm4
-; SSE42-NEXT:    packusdw %xmm6, %xmm4
-; SSE42-NEXT:    packusdw %xmm7, %xmm4
-; SSE42-NEXT:    andpd %xmm0, %xmm5
-; SSE42-NEXT:    andpd %xmm0, %xmm3
-; SSE42-NEXT:    packusdw %xmm5, %xmm3
-; SSE42-NEXT:    andpd %xmm0, %xmm9
-; SSE42-NEXT:    andpd %xmm0, %xmm2
-; SSE42-NEXT:    packusdw %xmm2, %xmm9
-; SSE42-NEXT:    packusdw %xmm3, %xmm9
-; SSE42-NEXT:    packuswb %xmm4, %xmm9
-; SSE42-NEXT:    movdqa %xmm9, %xmm0
-; SSE42-NEXT:    retq
-;
-; AVX1-LABEL: abd_ext_v16i8_undef:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm10 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm11, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm11, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm11, %xmm6, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm10, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm11, %xmm8, %xmm8
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm10, %xmm7, %xmm7
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm10, %xmm9, %xmm9
-; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm6, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm9, %ymm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm8, %ymm12
-; AVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm5
-; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm10, %ymm4, %ymm10, %ymm4
-; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vpsubq %xmm7, %xmm2, %xmm3
-; AVX1-NEXT:    vpsubq %xmm8, %xmm2, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm12, %ymm3, %ymm12, %ymm3
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm11, %ymm0, %ymm11, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX1-NEXT:    vandpd %ymm2, %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vandpd %ymm2, %ymm4, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vandpd %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm2, %ymm3, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v16i8_undef:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm6, %ymm3, %ymm3
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm0, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm0, %ymm0
-; AVX2-NEXT:    vpsubq %ymm4, %ymm2, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX2-NEXT:    vandpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vandpd %ymm2, %ymm4, %ymm4
-; AVX2-NEXT:    vpackusdw %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vandpd %ymm2, %ymm3, %ymm2
-; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; SSE-LABEL: abd_ext_v16i8_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa %xmm0, %xmm2
+; SSE-NEXT:    pminub %xmm1, %xmm2
+; SSE-NEXT:    pmaxub %xmm1, %xmm0
+; SSE-NEXT:    psubb %xmm2, %xmm0
+; SSE-NEXT:    retq
 ;
-; AVX512-LABEL: abd_ext_v16i8_undef:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v16i8_undef:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = zext <16 x i8> %a to <16 x i64>
   %bext = zext <16 x i8> %b to <16 x i64>
   %sub = sub <16 x i64> %aext, %bext
@@ -802,126 +116,18 @@ define <8 x i16> @abd_ext_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; SSE42-LABEL: abd_ext_v8i16:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm2
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm4
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm2, %xmm6
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm5, %xmm6
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm5
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm3, %xmm6
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm3
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm4, %xmm6
-; SSE42-NEXT:    movdqa %xmm4, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm4
-; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7]
-; SSE42-NEXT:    packusdw %xmm4, %xmm3
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7]
-; SSE42-NEXT:    packusdw %xmm2, %xmm5
-; SSE42-NEXT:    packusdw %xmm3, %xmm5
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    pminuw %xmm1, %xmm2
+; SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; SSE42-NEXT:    psubw %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
-; AVX1-LABEL: abd_ext_v8i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm5
-; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpsubq %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpsubq %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm5, %ymm2
-; AVX1-NEXT:    vpsubq %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpsubq %xmm4, %xmm6, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v8i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: abd_ext_v8i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v8i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = zext <8 x i16> %a to <8 x i64>
   %bext = zext <8 x i16> %b to <8 x i64>
   %sub = sub <8 x i64> %aext, %bext
@@ -991,126 +197,18 @@ define <8 x i16> @abd_ext_v8i16_undef(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; SSE42-LABEL: abd_ext_v8i16_undef:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm2
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm5
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
-; SSE42-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE42-NEXT:    psubq %xmm0, %xmm4
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm2, %xmm6
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm5, %xmm6
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm5
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm3, %xmm6
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm3
-; SSE42-NEXT:    pxor %xmm6, %xmm6
-; SSE42-NEXT:    psubq %xmm4, %xmm6
-; SSE42-NEXT:    movdqa %xmm4, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm6, %xmm4
-; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7]
-; SSE42-NEXT:    packusdw %xmm4, %xmm3
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7]
-; SSE42-NEXT:    packusdw %xmm2, %xmm5
-; SSE42-NEXT:    packusdw %xmm3, %xmm5
-; SSE42-NEXT:    movdqa %xmm5, %xmm0
+; SSE42-NEXT:    movdqa %xmm0, %xmm2
+; SSE42-NEXT:    pminuw %xmm1, %xmm2
+; SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; SSE42-NEXT:    psubw %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
-; AVX1-LABEL: abd_ext_v8i16_undef:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm4, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm5
-; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX1-NEXT:    vpsubq %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpsubq %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm5, %ymm2
-; AVX1-NEXT:    vpsubq %xmm0, %xmm6, %xmm0
-; AVX1-NEXT:    vpsubq %xmm4, %xmm6, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v8i16_undef:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: abd_ext_v8i16_undef:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v8i16_undef:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = zext <8 x i16> %a to <8 x i64>
   %bext = zext <8 x i16> %b to <8 x i64>
   %sub = sub <8 x i64> %aext, %bext
@@ -1147,65 +245,17 @@ define <4 x i32> @abd_ext_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE42-LABEL: abd_ext_v4i32:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    pxor %xmm4, %xmm4
-; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
-; SSE42-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE42-NEXT:    psubq %xmm1, %xmm2
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    psubq %xmm2, %xmm1
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
-; SSE42-NEXT:    psubq %xmm3, %xmm4
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
-; SSE42-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
-; SSE42-NEXT:    movaps %xmm3, %xmm0
+; SSE42-NEXT:    pminud %xmm1, %xmm2
+; SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; SSE42-NEXT:    psubd %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
-; AVX1-LABEL: abd_ext_v4i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
-; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v4i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: abd_ext_v4i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpabsq %ymm0, %ymm0
-; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = zext <4 x i32> %a to <4 x i64>
   %bext = zext <4 x i32> %b to <4 x i64>
   %sub = sub <4 x i64> %aext, %bext
@@ -1242,65 +292,17 @@ define <4 x i32> @abd_ext_v4i32_undef(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; SSE42-LABEL: abd_ext_v4i32_undef:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
-; SSE42-NEXT:    pxor %xmm4, %xmm4
-; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; SSE42-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
-; SSE42-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE42-NEXT:    psubq %xmm1, %xmm2
-; SSE42-NEXT:    psubq %xmm0, %xmm3
-; SSE42-NEXT:    pxor %xmm1, %xmm1
-; SSE42-NEXT:    psubq %xmm2, %xmm1
-; SSE42-NEXT:    movdqa %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
-; SSE42-NEXT:    psubq %xmm3, %xmm4
-; SSE42-NEXT:    movdqa %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm3
-; SSE42-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
-; SSE42-NEXT:    movaps %xmm3, %xmm0
+; SSE42-NEXT:    pminud %xmm1, %xmm2
+; SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; SSE42-NEXT:    psubd %xmm2, %xmm0
 ; SSE42-NEXT:    retq
 ;
-; AVX1-LABEL: abd_ext_v4i32_undef:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
-; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: abd_ext_v4i32_undef:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: abd_ext_v4i32_undef:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpabsq %ymm0, %ymm0
-; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX-LABEL: abd_ext_v4i32_undef:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm2
+; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %aext = zext <4 x i32> %a to <4 x i64>
   %bext = zext <4 x i32> %b to <4 x i64>
   %sub = sub <4 x i64> %aext, %bext
@@ -1358,28 +360,58 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE42-NEXT:    retq
 ;
-; AVX-LABEL: abd_ext_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovq %xmm0, %rax
-; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX-NEXT:    vmovq %xmm1, %rdx
-; AVX-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX-NEXT:    xorl %edi, %edi
-; AVX-NEXT:    subq %rsi, %rcx
-; AVX-NEXT:    movl $0, %esi
-; AVX-NEXT:    sbbq %rsi, %rsi
-; AVX-NEXT:    subq %rdx, %rax
-; AVX-NEXT:    sbbq %rdi, %rdi
-; AVX-NEXT:    sarq $63, %rdi
-; AVX-NEXT:    xorq %rdi, %rax
-; AVX-NEXT:    subq %rdi, %rax
-; AVX-NEXT:    sarq $63, %rsi
-; AVX-NEXT:    xorq %rsi, %rcx
-; AVX-NEXT:    subq %rsi, %rcx
-; AVX-NEXT:    vmovq %rcx, %xmm0
-; AVX-NEXT:    vmovq %rax, %xmm1
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT:    retq
+; AVX1-LABEL: abd_ext_v2i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT:    vmovq %xmm1, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT:    xorl %edi, %edi
+; AVX1-NEXT:    subq %rsi, %rcx
+; AVX1-NEXT:    movl $0, %esi
+; AVX1-NEXT:    sbbq %rsi, %rsi
+; AVX1-NEXT:    subq %rdx, %rax
+; AVX1-NEXT:    sbbq %rdi, %rdi
+; AVX1-NEXT:    sarq $63, %rdi
+; AVX1-NEXT:    xorq %rdi, %rax
+; AVX1-NEXT:    subq %rdi, %rax
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    xorq %rsi, %rcx
+; AVX1-NEXT:    subq %rsi, %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: abd_ext_v2i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    subq %rsi, %rcx
+; AVX2-NEXT:    movl $0, %esi
+; AVX2-NEXT:    sbbq %rsi, %rsi
+; AVX2-NEXT:    subq %rdx, %rax
+; AVX2-NEXT:    sbbq %rdi, %rdi
+; AVX2-NEXT:    sarq $63, %rdi
+; AVX2-NEXT:    xorq %rdi, %rax
+; AVX2-NEXT:    subq %rdi, %rax
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    xorq %rsi, %rcx
+; AVX2-NEXT:    subq %rsi, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: abd_ext_v2i64:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminuq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %aext = zext <2 x i64> %a to <2 x i128>
   %bext = zext <2 x i64> %b to <2 x i128>
   %sub = sub <2 x i128> %aext, %bext
@@ -1437,28 +469,58 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE42-NEXT:    retq
 ;
-; AVX-LABEL: abd_ext_v2i64_undef:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovq %xmm0, %rax
-; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX-NEXT:    vmovq %xmm1, %rdx
-; AVX-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX-NEXT:    xorl %edi, %edi
-; AVX-NEXT:    subq %rsi, %rcx
-; AVX-NEXT:    movl $0, %esi
-; AVX-NEXT:    sbbq %rsi, %rsi
-; AVX-NEXT:    subq %rdx, %rax
-; AVX-NEXT:    sbbq %rdi, %rdi
-; AVX-NEXT:    sarq $63, %rdi
-; AVX-NEXT:    xorq %rdi, %rax
-; AVX-NEXT:    subq %rdi, %rax
-; AVX-NEXT:    sarq $63, %rsi
-; AVX-NEXT:    xorq %rsi, %rcx
-; AVX-NEXT:    subq %rsi, %rcx
-; AVX-NEXT:    vmovq %rcx, %xmm0
-; AVX-NEXT:    vmovq %rax, %xmm1
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT:    retq
+; AVX1-LABEL: abd_ext_v2i64_undef:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT:    vmovq %xmm1, %rdx
+; AVX1-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX1-NEXT:    xorl %edi, %edi
+; AVX1-NEXT:    subq %rsi, %rcx
+; AVX1-NEXT:    movl $0, %esi
+; AVX1-NEXT:    sbbq %rsi, %rsi
+; AVX1-NEXT:    subq %rdx, %rax
+; AVX1-NEXT:    sbbq %rdi, %rdi
+; AVX1-NEXT:    sarq $63, %rdi
+; AVX1-NEXT:    xorq %rdi, %rax
+; AVX1-NEXT:    subq %rdi, %rax
+; AVX1-NEXT:    sarq $63, %rsi
+; AVX1-NEXT:    xorq %rsi, %rcx
+; AVX1-NEXT:    subq %rsi, %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm0
+; AVX1-NEXT:    vmovq %rax, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: abd_ext_v2i64_undef:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    subq %rsi, %rcx
+; AVX2-NEXT:    movl $0, %esi
+; AVX2-NEXT:    sbbq %rsi, %rsi
+; AVX2-NEXT:    subq %rdx, %rax
+; AVX2-NEXT:    sbbq %rdi, %rdi
+; AVX2-NEXT:    sarq $63, %rdi
+; AVX2-NEXT:    xorq %rdi, %rax
+; AVX2-NEXT:    subq %rdi, %rax
+; AVX2-NEXT:    sarq $63, %rsi
+; AVX2-NEXT:    xorq %rsi, %rcx
+; AVX2-NEXT:    subq %rsi, %rcx
+; AVX2-NEXT:    vmovq %rcx, %xmm0
+; AVX2-NEXT:    vmovq %rax, %xmm1
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: abd_ext_v2i64_undef:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpminuq %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %aext = zext <2 x i64> %a to <2 x i128>
   %bext = zext <2 x i64> %b to <2 x i128>
   %sub = sub <2 x i128> %aext, %bext

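The unsigned 128-bit tests above all collapse to the same MIN/MAX/SUB sequence once the target has the matching vector min/max. For reference, a minimal sketch of the IR shape these tests exercise (widen, subtract, take the absolute value, truncate back) follows; the function name and the <16 x i8> width are illustrative only and are not taken from this patch:

; Sketch only (not part of this commit): trunc(abs(sub(zext(a), zext(b)))),
; which the new lowering now matches to an unsigned MIN/MAX/SUB sequence.
define <16 x i8> @abdu_sketch(<16 x i8> %a, <16 x i8> %b) {
  %aext = zext <16 x i8> %a to <16 x i16>
  %bext = zext <16 x i8> %b to <16 x i16>
  %sub  = sub <16 x i16> %aext, %bext
  %abs  = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 false)
  %res  = trunc <16 x i16> %abs to <16 x i8>
  ret <16 x i8> %res
}
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
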
diff --git a/llvm/test/CodeGen/X86/abdu-vector-256.ll b/llvm/test/CodeGen/X86/abdu-vector-256.ll
index fa9ccd069570e..52a678d21b0aa 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-256.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-256.ll
@@ -10,295 +10,29 @@
 define <32 x i8> @abd_ext_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    pushq %rax
-; AVX1-NEXT:    vmovdqa %ymm0, %ymm14
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm14, %xmm3
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm13, %xmm5, %xmm5
-; AVX1-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm9, %xmm9
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm12, %xmm5
-; AVX1-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm12, %xmm7, %xmm7
-; AVX1-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm12 = xmm5[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm12, %xmm11, %xmm7
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm11, %xmm6, %xmm6
-; AVX1-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm10, %xmm12
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm5[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm10, %xmm11, %xmm11
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm10, %xmm5, %xmm10
-; AVX1-NEXT:    vpsrld $16, %xmm14, %xmm5
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm15
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm14 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm6, %xmm14, %xmm14
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm13
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm1
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm6
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm9, %ymm6
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm6, %ymm1, %ymm0
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm6
-; AVX1-NEXT:    vpsubq %xmm8, %xmm2, %xmm8
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm8, %ymm6, %ymm8
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm6
-; AVX1-NEXT:    vpsubq %xmm7, %xmm2, %xmm9
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm7, %ymm7
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm7, %ymm6, %ymm7
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm1, %ymm6
-; AVX1-NEXT:    vpsubq %xmm12, %xmm2, %xmm9
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm12, %ymm9
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm9, %ymm6, %ymm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm6
-; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm4
-; AVX1-NEXT:    vpsubq %xmm11, %xmm2, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm4, %ymm6, %ymm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm10, %ymm6
-; AVX1-NEXT:    vpsubq %xmm15, %xmm2, %xmm11
-; AVX1-NEXT:    vpsubq %xmm10, %xmm2, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm10, %ymm10
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm10, %ymm6, %ymm6
-; AVX1-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm10
-; AVX1-NEXT:    vpsubq %xmm13, %xmm2, %xmm1
-; AVX1-NEXT:    vpsubq %xmm14, %xmm2, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm11, %ymm1
-; AVX1-NEXT:    vblendvpd %ymm10, %ymm1, %ymm10, %ymm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm1
-; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm1, %ymm2
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [255,255,255,255]
-; AVX1-NEXT:    vandpd %ymm1, %ymm9, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm7, %ymm5
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm7
-; AVX1-NEXT:    vpackusdw %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm1, %ymm10, %ymm5
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm7
-; AVX1-NEXT:    vpackusdw %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm1, %ymm8, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm6, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm4, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    popq %rax
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm10, %ymm3, %ymm3
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm10, %ymm4, %ymm4
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm10, %ymm5, %ymm5
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm9, %ymm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm10, %ymm7, %ymm7
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm9, %ymm8, %ymm8
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm6, %ymm1
-; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX2-NEXT:    vpsubq %ymm3, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm9, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm4, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm9, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm5, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm5, %ymm9, %ymm5, %ymm5
-; AVX2-NEXT:    vpsubq %ymm2, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm9, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm7, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm7, %ymm9, %ymm7, %ymm7
-; AVX2-NEXT:    vpsubq %ymm0, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm9, %ymm0, %ymm0
-; AVX2-NEXT:    vpsubq %ymm8, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm8, %ymm9, %ymm8, %ymm8
-; AVX2-NEXT:    vpsubq %ymm1, %ymm6, %ymm6
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [255,255,255,255]
-; AVX2-NEXT:    vandpd %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vandpd %ymm6, %ymm8, %ymm8
-; AVX2-NEXT:    vpackusdw %ymm8, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm0, %ymm0
-; AVX2-NEXT:    vandpd %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpackusdw %ymm7, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm2, %ymm1
-; AVX2-NEXT:    vandpd %ymm6, %ymm5, %ymm2
-; AVX2-NEXT:    vpackusdw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm4, %ymm2
-; AVX2-NEXT:    vandpd %ymm6, %ymm3, %ymm3
-; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v32i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm6, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm5, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm5, %zmm4, %zmm4
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm3, %zmm1
-; AVX512-NEXT:    vpabsq %zmm2, %zmm2
-; AVX512-NEXT:    vpabsq %zmm4, %zmm3
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = zext <32 x i8> %a to <32 x i64>
   %bext = zext <32 x i8> %b to <32 x i64>
@@ -311,295 +45,29 @@ define <32 x i8> @abd_ext_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <32 x i8> @abd_ext_v32i8_undef(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v32i8_undef:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    pushq %rax
-; AVX1-NEXT:    vmovdqa %ymm0, %ymm14
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm14, %xmm3
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm13, %xmm5, %xmm5
-; AVX1-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm9, %xmm9
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm12, %xmm5
-; AVX1-NEXT:    vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm12 = xmm5[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm12, %xmm7, %xmm7
-; AVX1-NEXT:    vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm12 = xmm5[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm12, %xmm11, %xmm7
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm11, %xmm6, %xmm6
-; AVX1-NEXT:    vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm11 = xmm4[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm10, %xmm12
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm10 = xmm5[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm10, %xmm11, %xmm11
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,zero,zero,zero,zero,xmm14[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm10, %xmm5, %xmm10
-; AVX1-NEXT:    vpsrld $16, %xmm14, %xmm5
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm5, %xmm15
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm14 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm6, %xmm14, %xmm14
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm13
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm5 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm6, %xmm5, %xmm5
-; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm3
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm0, %ymm1
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm6
-; AVX1-NEXT:    vpsubq %xmm0, %xmm2, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm9, %ymm6
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm6, %ymm1, %ymm0
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm6
-; AVX1-NEXT:    vpsubq %xmm8, %xmm2, %xmm8
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm8, %ymm6, %ymm8
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm1, %ymm6
-; AVX1-NEXT:    vpsubq %xmm7, %xmm2, %xmm9
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm7
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm7, %ymm7
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm7, %ymm6, %ymm7
-; AVX1-NEXT:    vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX1-NEXT:    vinsertf128 $1, %xmm12, %ymm1, %ymm6
-; AVX1-NEXT:    vpsubq %xmm12, %xmm2, %xmm9
-; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm12
-; AVX1-NEXT:    vinsertf128 $1, %xmm9, %ymm12, %ymm9
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm9, %ymm6, %ymm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm6
-; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm4
-; AVX1-NEXT:    vpsubq %xmm11, %xmm2, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm11, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm4, %ymm6, %ymm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm15, %ymm10, %ymm6
-; AVX1-NEXT:    vpsubq %xmm15, %xmm2, %xmm11
-; AVX1-NEXT:    vpsubq %xmm10, %xmm2, %xmm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm11, %ymm10, %ymm10
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm10, %ymm6, %ymm6
-; AVX1-NEXT:    vinsertf128 $1, %xmm13, %ymm14, %ymm10
-; AVX1-NEXT:    vpsubq %xmm13, %xmm2, %xmm1
-; AVX1-NEXT:    vpsubq %xmm14, %xmm2, %xmm11
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm11, %ymm1
-; AVX1-NEXT:    vblendvpd %ymm10, %ymm1, %ymm10, %ymm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm1
-; AVX1-NEXT:    vpsubq %xmm3, %xmm2, %xmm3
-; AVX1-NEXT:    vpsubq %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm2, %ymm1, %ymm2
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [255,255,255,255]
-; AVX1-NEXT:    vandpd %ymm1, %ymm9, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm7, %ymm5
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm7
-; AVX1-NEXT:    vpackusdw %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm1, %ymm10, %ymm5
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm7
-; AVX1-NEXT:    vpackusdw %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandpd %ymm1, %ymm8, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm6, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm4, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    popq %rax
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v32i8_undef:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm8 = xmm6[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm10, %ymm3, %ymm3
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm10 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm10, %ymm4, %ymm4
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm10, %ymm5, %ymm5
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm9, %ymm2, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm10, %ymm7, %ymm7
-; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm9, %ymm8, %ymm8
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm6, %ymm1
-; AVX2-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX2-NEXT:    vpsubq %ymm3, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm9, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm4, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm9, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm5, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm5, %ymm9, %ymm5, %ymm5
-; AVX2-NEXT:    vpsubq %ymm2, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm9, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm7, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm7, %ymm9, %ymm7, %ymm7
-; AVX2-NEXT:    vpsubq %ymm0, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm9, %ymm0, %ymm0
-; AVX2-NEXT:    vpsubq %ymm8, %ymm6, %ymm9
-; AVX2-NEXT:    vblendvpd %ymm8, %ymm9, %ymm8, %ymm8
-; AVX2-NEXT:    vpsubq %ymm1, %ymm6, %ymm6
-; AVX2-NEXT:    vblendvpd %ymm1, %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [255,255,255,255]
-; AVX2-NEXT:    vandpd %ymm6, %ymm1, %ymm1
-; AVX2-NEXT:    vandpd %ymm6, %ymm8, %ymm8
-; AVX2-NEXT:    vpackusdw %ymm8, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm0, %ymm0
-; AVX2-NEXT:    vandpd %ymm6, %ymm7, %ymm7
-; AVX2-NEXT:    vpackusdw %ymm7, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm2, %ymm1
-; AVX2-NEXT:    vandpd %ymm6, %ymm5, %ymm2
-; AVX2-NEXT:    vpackusdw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vandpd %ymm6, %ymm4, %ymm2
-; AVX2-NEXT:    vandpd %ymm6, %ymm3, %ymm3
-; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vpackuswb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v32i8_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm6, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm5, %xmm5
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm5, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm5, %zmm4, %zmm4
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm3, %zmm1
-; AVX512-NEXT:    vpabsq %zmm2, %zmm2
-; AVX512-NEXT:    vpabsq %zmm4, %zmm3
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = zext <32 x i8> %a to <32 x i64>
   %bext = zext <32 x i8> %b to <32 x i64>
@@ -612,139 +80,29 @@ define <32 x i8> @abd_ext_v32i8_undef(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <16 x i16> @abd_ext_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm5, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm6, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm7, %xmm7
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm8, %xmm8
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm11
-; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
-; AVX1-NEXT:    vpsubq %xmm3, %xmm12, %xmm3
-; AVX1-NEXT:    vpsubq %xmm2, %xmm12, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm11, %ymm2, %ymm11, %ymm2
-; AVX1-NEXT:    vpsubq %xmm5, %xmm12, %xmm3
-; AVX1-NEXT:    vpsubq %xmm4, %xmm12, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm10, %ymm3, %ymm10, %ymm3
-; AVX1-NEXT:    vpsubq %xmm7, %xmm12, %xmm4
-; AVX1-NEXT:    vpsubq %xmm6, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm4, %ymm9, %ymm4
-; AVX1-NEXT:    vpsubq %xmm0, %xmm12, %xmm0
-; AVX1-NEXT:    vpsubq %xmm8, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm4, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm4, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v16i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = zext <16 x i16> %a to <16 x i64>
   %bext = zext <16 x i16> %b to <16 x i64>
@@ -757,139 +115,29 @@ define <16 x i16> @abd_ext_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 define <16 x i16> @abd_ext_v16i16_undef(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v16i16_undef:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm2, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm4, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm5, %xmm5
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[2,3,2,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm6, %xmm6
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm9 = xmm1[3,3,3,3]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm7, %xmm7
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm9, %xmm8, %xmm8
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
-; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm8, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm6, %ymm9
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm10
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm11
-; AVX1-NEXT:    vpxor %xmm12, %xmm12, %xmm12
-; AVX1-NEXT:    vpsubq %xmm3, %xmm12, %xmm3
-; AVX1-NEXT:    vpsubq %xmm2, %xmm12, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm11, %ymm2, %ymm11, %ymm2
-; AVX1-NEXT:    vpsubq %xmm5, %xmm12, %xmm3
-; AVX1-NEXT:    vpsubq %xmm4, %xmm12, %xmm4
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX1-NEXT:    vblendvpd %ymm10, %ymm3, %ymm10, %ymm3
-; AVX1-NEXT:    vpsubq %xmm7, %xmm12, %xmm4
-; AVX1-NEXT:    vpsubq %xmm6, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT:    vblendvpd %ymm9, %ymm4, %ymm9, %ymm4
-; AVX1-NEXT:    vpsubq %xmm0, %xmm12, %xmm0
-; AVX1-NEXT:    vpsubq %xmm8, %xmm12, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [65535,65535,65535,65535]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm4, %ymm4
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
-; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm3, %ymm3
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v16i16_undef:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm5, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm3, %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubq %ymm4, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm4, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm5
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm5, %ymm0, %ymm0
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15]
-; AVX2-NEXT:    vpackusdw %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT:    vpackusdw %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v16i16_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = zext <16 x i16> %a to <16 x i64>
   %bext = zext <16 x i16> %b to <16 x i64>
@@ -902,63 +150,29 @@ define <16 x i16> @abd_ext_v16i16_undef(<16 x i16> %a, <16 x i16> %b) nounwind {
 define <8 x i32> @abd_ext_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX1-NEXT:    vpsubq %xmm7, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT:    vpsubq %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
-; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm6
-; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm4
-; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm2, %ymm6, %ymm2
-; AVX1-NEXT:    vpsubq %xmm5, %xmm3, %xmm4
-; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v8i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = zext <8 x i32> %a to <8 x i64>
   %bext = zext <8 x i32> %b to <8 x i64>
@@ -971,63 +185,29 @@ define <8 x i32> @abd_ext_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 define <8 x i32> @abd_ext_v8i32_undef(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX1-LABEL: abd_ext_v8i32_undef:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX1-NEXT:    vpsubq %xmm7, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT:    vpsubq %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero
-; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm0, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm6
-; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm4
-; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT:    vblendvpd %ymm6, %ymm2, %ymm6, %ymm2
-; AVX1-NEXT:    vpsubq %xmm5, %xmm3, %xmm4
-; AVX1-NEXT:    vpsubq %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT:    vblendvpd %ymm1, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
+; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_ext_v8i32_undef:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX2-NEXT:    vpsubq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3],ymm2[2,3]
-; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm2
+; AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: abd_ext_v8i32_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = zext <8 x i32> %a to <8 x i64>
   %bext = zext <8 x i32> %b to <8 x i64>
@@ -1130,47 +310,9 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ;
 ; AVX512-LABEL: abd_ext_v4i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rdx
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512-NEXT:    vmovq %xmm1, %r8
-; AVX512-NEXT:    vpextrq $1, %xmm1, %r9
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %r10
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rdi
-; AVX512-NEXT:    xorl %r11d, %r11d
-; AVX512-NEXT:    subq %rdi, %rsi
-; AVX512-NEXT:    movl $0, %edi
-; AVX512-NEXT:    sbbq %rdi, %rdi
-; AVX512-NEXT:    subq %r10, %rdx
-; AVX512-NEXT:    movl $0, %r10d
-; AVX512-NEXT:    sbbq %r10, %r10
-; AVX512-NEXT:    subq %r9, %rcx
-; AVX512-NEXT:    movl $0, %r9d
-; AVX512-NEXT:    sbbq %r9, %r9
-; AVX512-NEXT:    subq %r8, %rax
-; AVX512-NEXT:    sbbq %r11, %r11
-; AVX512-NEXT:    sarq $63, %r11
-; AVX512-NEXT:    xorq %r11, %rax
-; AVX512-NEXT:    subq %r11, %rax
-; AVX512-NEXT:    sarq $63, %r9
-; AVX512-NEXT:    xorq %r9, %rcx
-; AVX512-NEXT:    subq %r9, %rcx
-; AVX512-NEXT:    sarq $63, %r10
-; AVX512-NEXT:    xorq %r10, %rdx
-; AVX512-NEXT:    subq %r10, %rdx
-; AVX512-NEXT:    sarq $63, %rdi
-; AVX512-NEXT:    xorq %rdi, %rsi
-; AVX512-NEXT:    subq %rdi, %rsi
-; AVX512-NEXT:    vmovq %rsi, %xmm0
-; AVX512-NEXT:    vmovq %rdx, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vmovq %rcx, %xmm1
-; AVX512-NEXT:    vmovq %rax, %xmm2
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpminuq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = zext <4 x i64> %a to <4 x i128>
   %bext = zext <4 x i64> %b to <4 x i128>
@@ -1273,47 +415,9 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
 ;
 ; AVX512-LABEL: abd_ext_v4i64_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rdx
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512-NEXT:    vmovq %xmm1, %r8
-; AVX512-NEXT:    vpextrq $1, %xmm1, %r9
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %r10
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rdi
-; AVX512-NEXT:    xorl %r11d, %r11d
-; AVX512-NEXT:    subq %rdi, %rsi
-; AVX512-NEXT:    movl $0, %edi
-; AVX512-NEXT:    sbbq %rdi, %rdi
-; AVX512-NEXT:    subq %r10, %rdx
-; AVX512-NEXT:    movl $0, %r10d
-; AVX512-NEXT:    sbbq %r10, %r10
-; AVX512-NEXT:    subq %r9, %rcx
-; AVX512-NEXT:    movl $0, %r9d
-; AVX512-NEXT:    sbbq %r9, %r9
-; AVX512-NEXT:    subq %r8, %rax
-; AVX512-NEXT:    sbbq %r11, %r11
-; AVX512-NEXT:    sarq $63, %r11
-; AVX512-NEXT:    xorq %r11, %rax
-; AVX512-NEXT:    subq %r11, %rax
-; AVX512-NEXT:    sarq $63, %r9
-; AVX512-NEXT:    xorq %r9, %rcx
-; AVX512-NEXT:    subq %r9, %rcx
-; AVX512-NEXT:    sarq $63, %r10
-; AVX512-NEXT:    xorq %r10, %rdx
-; AVX512-NEXT:    subq %r10, %rdx
-; AVX512-NEXT:    sarq $63, %rdi
-; AVX512-NEXT:    xorq %rdi, %rsi
-; AVX512-NEXT:    subq %rdi, %rsi
-; AVX512-NEXT:    vmovq %rsi, %xmm0
-; AVX512-NEXT:    vmovq %rdx, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vmovq %rcx, %xmm1
-; AVX512-NEXT:    vmovq %rax, %xmm2
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpminuq %ymm1, %ymm0, %ymm2
+; AVX512-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %aext = zext <4 x i64> %a to <4 x i128>
   %bext = zext <4 x i64> %b to <4 x i128>
@@ -1330,15 +434,15 @@ define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
 define <32 x i8> @abd_minmax_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX1-LABEL: abd_minmax_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpminub %xmm3, %xmm4, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm3
 ; AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmaxub %xmm3, %xmm4, %xmm1
-; AVX1-NEXT:    vpsubb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsubb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_minmax_v32i8:
@@ -1363,15 +467,15 @@ define <32 x i8> @abd_minmax_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 define <16 x i16> @abd_minmax_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX1-LABEL: abd_minmax_v16i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpminuw %xmm3, %xmm4, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm3
 ; AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmaxuw %xmm3, %xmm4, %xmm1
-; AVX1-NEXT:    vpsubw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsubw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_minmax_v16i16:
@@ -1396,15 +500,15 @@ define <16 x i16> @abd_minmax_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 define <8 x i32> @abd_minmax_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX1-LABEL: abd_minmax_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpminud %xmm3, %xmm4, %xmm5
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm4
+; AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm3
 ; AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpmaxud %xmm3, %xmm4, %xmm1
-; AVX1-NEXT:    vpsubd %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: abd_minmax_v8i32:

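The updated CHECK lines above all follow the same shape: the unsigned absolute difference is lowered as a vector subtract of the element-wise unsigned max and min (for example vpsubw of vpmaxuw and vpminuw, or vpsubd of vpmaxud and vpminud). As a rough illustration only, and not code taken from this patch, a hypothetical DAG-level helper expressing that expansion could look like this:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Sketch: expand abdu(x, y) as sub(umax(x, y), umin(x, y)).
// Assumes ISD::UMIN/ISD::UMAX are usable for VT on the target.
static SDValue expandABDUViaMinMax(SDNode *N, SelectionDAG &DAG) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue X = N->getOperand(0);
  SDValue Y = N->getOperand(1);
  SDValue Max = DAG.getNode(ISD::UMAX, DL, VT, X, Y);
  SDValue Min = DAG.getNode(ISD::UMIN, DL, VT, X, Y);
  return DAG.getNode(ISD::SUB, DL, VT, Max, Min);
}

In the abdu-vector-512.ll diff below, the AVX512BW output applies the same pattern directly on zmm registers, while the AVX512DQ output splits into two ymm halves, since the 512-bit byte/word min/max instructions require AVX512BW.
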
diff  --git a/llvm/test/CodeGen/X86/abdu-vector-512.ll b/llvm/test/CodeGen/X86/abdu-vector-512.ll
index ab04fd62273a3..915e82f04f969 100644
--- a/llvm/test/CodeGen/X86/abdu-vector-512.ll
+++ b/llvm/test/CodeGen/X86/abdu-vector-512.ll
@@ -7,78 +7,25 @@
 ;
 
 define <64 x i8> @abd_ext_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
-; AVX512-LABEL: abd_ext_v64i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm7 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm10 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm10, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm9, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm9
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero,xmm9[8],zero,xmm9[9],zero,xmm9[10],zero,xmm9[11],zero,xmm9[12],zero,xmm9[13],zero,xmm9[14],zero,xmm9[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm10 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm10, %zmm5, %zmm5
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm9, %zmm4, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm10 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm10, %zmm7, %zmm7
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm9, %zmm6, %zmm6
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm9, %zmm8, %zmm8
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm3, %zmm1
-; AVX512-NEXT:    vpabsq %zmm2, %zmm2
-; AVX512-NEXT:    vpabsq %zmm5, %zmm3
-; AVX512-NEXT:    vpabsq %zmm4, %zmm4
-; AVX512-NEXT:    vpabsq %zmm7, %zmm5
-; AVX512-NEXT:    vpabsq %zmm6, %zmm6
-; AVX512-NEXT:    vpabsq %zmm8, %zmm7
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0]
-; AVX512-NEXT:    vpmovqb %zmm6, %xmm6
-; AVX512-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm5, %ymm0
-; AVX512-NEXT:    vpmovqb %zmm4, %xmm4
-; AVX512-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512BW-LABEL: abd_ext_v64i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: abd_ext_v64i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminub %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %aext = zext <64 x i8> %a to <64 x i64>
   %bext = zext <64 x i8> %b to <64 x i64>
   %sub = sub <64 x i64> %aext, %bext
@@ -88,78 +35,25 @@ define <64 x i8> @abd_ext_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 }
 
 define <64 x i8> @abd_ext_v64i8_undef(<64 x i8> %a, <64 x i8> %b) nounwind {
-; AVX512-LABEL: abd_ext_v64i8_undef:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm7 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm6, %xmm6
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm10 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm10, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm9, %zmm2, %zmm2
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm9
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero,xmm9[8],zero,xmm9[9],zero,xmm9[10],zero,xmm9[11],zero,xmm9[12],zero,xmm9[13],zero,xmm9[14],zero,xmm9[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm10 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm10, %zmm5, %zmm5
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm9, %zmm4, %zmm4
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm10 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm10, %zmm7, %zmm7
-; AVX512-NEXT:    vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm9, %zmm6, %zmm6
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm9, %zmm8, %zmm8
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm3, %zmm1
-; AVX512-NEXT:    vpabsq %zmm2, %zmm2
-; AVX512-NEXT:    vpabsq %zmm5, %zmm3
-; AVX512-NEXT:    vpabsq %zmm4, %zmm4
-; AVX512-NEXT:    vpabsq %zmm7, %zmm5
-; AVX512-NEXT:    vpabsq %zmm6, %zmm6
-; AVX512-NEXT:    vpabsq %zmm8, %zmm7
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqb %zmm7, %xmm7
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm7[0],xmm0[0]
-; AVX512-NEXT:    vpmovqb %zmm6, %xmm6
-; AVX512-NEXT:    vpmovqb %zmm5, %xmm5
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm5, %ymm0
-; AVX512-NEXT:    vpmovqb %zmm4, %xmm4
-; AVX512-NEXT:    vpmovqb %zmm3, %xmm3
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512-NEXT:    vpmovqb %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512BW-LABEL: abd_ext_v64i8_undef:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsubb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: abd_ext_v64i8_undef:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminub %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %aext = zext <64 x i8> %a to <64 x i64>
   %bext = zext <64 x i8> %b to <64 x i64>
   %sub = sub <64 x i64> %aext, %bext
@@ -169,38 +63,25 @@ define <64 x i8> @abd_ext_v64i8_undef(<64 x i8> %a, <64 x i8> %b) nounwind {
 }
 
 define <32 x i16> @abd_ext_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
-; AVX512-LABEL: abd_ext_v32i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm5, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm5, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm3, %zmm2
-; AVX512-NEXT:    vpabsq %zmm4, %zmm3
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512-NEXT:    vpmovqw %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512BW-LABEL: abd_ext_v32i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: abd_ext_v32i16:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %aext = zext <32 x i16> %a to <32 x i64>
   %bext = zext <32 x i16> %b to <32 x i64>
   %sub = sub <32 x i64> %aext, %bext
@@ -210,38 +91,25 @@ define <32 x i16> @abd_ext_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 }
 
 define <32 x i16> @abd_ext_v32i16_undef(<32 x i16> %a, <32 x i16> %b) nounwind {
-; AVX512-LABEL: abd_ext_v32i16_undef:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm5, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm5, %zmm3, %zmm3
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm5, %zmm4, %zmm4
-; AVX512-NEXT:    vpmovzxwq {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm3, %zmm2
-; AVX512-NEXT:    vpabsq %zmm4, %zmm3
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512-NEXT:    vpmovqw %zmm3, %xmm3
-; AVX512-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; AVX512-NEXT:    vpmovqw %zmm2, %xmm2
-; AVX512-NEXT:    vpmovqw %zmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    retq
+; AVX512BW-LABEL: abd_ext_v32i16_undef:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsubw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: abd_ext_v32i16_undef:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm3
+; AVX512DQ-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    retq
   %aext = zext <32 x i16> %a to <32 x i64>
   %bext = zext <32 x i16> %b to <32 x i64>
   %sub = sub <32 x i64> %aext, %bext
@@ -253,19 +121,9 @@ define <32 x i16> @abd_ext_v32i16_undef(<32 x i16> %a, <32 x i16> %b) nounwind {
 define <16 x i32> @abd_ext_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
 ; AVX512-LABEL: abd_ext_v16i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %aext = zext <16 x i32> %a to <16 x i64>
   %bext = zext <16 x i32> %b to <16 x i64>
@@ -278,19 +136,9 @@ define <16 x i32> @abd_ext_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
 define <16 x i32> @abd_ext_v16i32_undef(<16 x i32> %a, <16 x i32> %b) nounwind {
 ; AVX512-LABEL: abd_ext_v16i32_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero
-; AVX512-NEXT:    vpsubq %zmm3, %zmm2, %zmm2
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
-; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpabsq %zmm2, %zmm1
-; AVX512-NEXT:    vpabsq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubd %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %aext = zext <16 x i32> %a to <16 x i64>
   %bext = zext <16 x i32> %b to <16 x i64>
@@ -303,108 +151,9 @@ define <16 x i32> @abd_ext_v16i32_undef(<16 x i32> %a, <16 x i32> %b) nounwind {
 define <8 x i64> @abd_ext_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; AVX512-LABEL: abd_ext_v8i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm2, %rdx
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512-NEXT:    vmovq %xmm2, %rsi
-; AVX512-NEXT:    vpextrq $1, %xmm2, %rdi
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, %r8
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r9
-; AVX512-NEXT:    vpextrq $1, %xmm1, %rbx
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, %rbp
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r13
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512-NEXT:    vmovq %xmm2, %r15
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r14
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, %r11
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r10
-; AVX512-NEXT:    xorl %r12d, %r12d
-; AVX512-NEXT:    subq %r10, %r9
-; AVX512-NEXT:    movl $0, %r10d
-; AVX512-NEXT:    sbbq %r10, %r10
-; AVX512-NEXT:    subq %r11, %r8
-; AVX512-NEXT:    movl $0, %r11d
-; AVX512-NEXT:    sbbq %r11, %r11
-; AVX512-NEXT:    subq %r14, %rdi
-; AVX512-NEXT:    movl $0, %r14d
-; AVX512-NEXT:    sbbq %r14, %r14
-; AVX512-NEXT:    subq %r15, %rsi
-; AVX512-NEXT:    movl $0, %r15d
-; AVX512-NEXT:    sbbq %r15, %r15
-; AVX512-NEXT:    subq %r13, %rdx
-; AVX512-NEXT:    movl $0, %r13d
-; AVX512-NEXT:    sbbq %r13, %r13
-; AVX512-NEXT:    subq %rbp, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %ebp
-; AVX512-NEXT:    sbbq %rbp, %rbp
-; AVX512-NEXT:    subq %rbx, %rcx
-; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    sbbq %rax, %rax
-; AVX512-NEXT:    vmovq %xmm0, %rbx
-; AVX512-NEXT:    vmovq %xmm1, %rcx
-; AVX512-NEXT:    subq %rcx, %rbx
-; AVX512-NEXT:    sbbq %r12, %r12
-; AVX512-NEXT:    sarq $63, %r12
-; AVX512-NEXT:    xorq %r12, %rbx
-; AVX512-NEXT:    subq %r12, %rbx
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT:    xorq %rax, %rcx
-; AVX512-NEXT:    subq %rax, %rcx
-; AVX512-NEXT:    movq %rcx, %rax
-; AVX512-NEXT:    sarq $63, %rbp
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT:    xorq %rbp, %rcx
-; AVX512-NEXT:    subq %rbp, %rcx
-; AVX512-NEXT:    sarq $63, %r13
-; AVX512-NEXT:    xorq %r13, %rdx
-; AVX512-NEXT:    subq %r13, %rdx
-; AVX512-NEXT:    sarq $63, %r15
-; AVX512-NEXT:    xorq %r15, %rsi
-; AVX512-NEXT:    subq %r15, %rsi
-; AVX512-NEXT:    sarq $63, %r14
-; AVX512-NEXT:    xorq %r14, %rdi
-; AVX512-NEXT:    subq %r14, %rdi
-; AVX512-NEXT:    sarq $63, %r11
-; AVX512-NEXT:    xorq %r11, %r8
-; AVX512-NEXT:    subq %r11, %r8
-; AVX512-NEXT:    sarq $63, %r10
-; AVX512-NEXT:    xorq %r10, %r9
-; AVX512-NEXT:    subq %r10, %r9
-; AVX512-NEXT:    vmovq %r9, %xmm0
-; AVX512-NEXT:    vmovq %r8, %xmm1
-; AVX512-NEXT:    vmovq %rdi, %xmm2
-; AVX512-NEXT:    vmovq %rsi, %xmm3
-; AVX512-NEXT:    vmovq %rdx, %xmm4
-; AVX512-NEXT:    vmovq %rcx, %xmm5
-; AVX512-NEXT:    vmovq %rax, %xmm6
-; AVX512-NEXT:    vmovq %rbx, %xmm7
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %aext = zext <8 x i64> %a to <8 x i128>
   %bext = zext <8 x i64> %b to <8 x i128>
@@ -417,108 +166,9 @@ define <8 x i64> @abd_ext_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
 define <8 x i64> @abd_ext_v8i64_undef(<8 x i64> %a, <8 x i64> %b) nounwind {
 ; AVX512-LABEL: abd_ext_v8i64_undef:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    vpextrq $1, %xmm0, %rcx
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, %rax
-; AVX512-NEXT:    vpextrq $1, %xmm2, %rdx
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512-NEXT:    vmovq %xmm2, %rsi
-; AVX512-NEXT:    vpextrq $1, %xmm2, %rdi
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, %r8
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r9
-; AVX512-NEXT:    vpextrq $1, %xmm1, %rbx
-; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, %rbp
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r13
-; AVX512-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512-NEXT:    vmovq %xmm2, %r15
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r14
-; AVX512-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT:    vmovq %xmm2, %r11
-; AVX512-NEXT:    vpextrq $1, %xmm2, %r10
-; AVX512-NEXT:    xorl %r12d, %r12d
-; AVX512-NEXT:    subq %r10, %r9
-; AVX512-NEXT:    movl $0, %r10d
-; AVX512-NEXT:    sbbq %r10, %r10
-; AVX512-NEXT:    subq %r11, %r8
-; AVX512-NEXT:    movl $0, %r11d
-; AVX512-NEXT:    sbbq %r11, %r11
-; AVX512-NEXT:    subq %r14, %rdi
-; AVX512-NEXT:    movl $0, %r14d
-; AVX512-NEXT:    sbbq %r14, %r14
-; AVX512-NEXT:    subq %r15, %rsi
-; AVX512-NEXT:    movl $0, %r15d
-; AVX512-NEXT:    sbbq %r15, %r15
-; AVX512-NEXT:    subq %r13, %rdx
-; AVX512-NEXT:    movl $0, %r13d
-; AVX512-NEXT:    sbbq %r13, %r13
-; AVX512-NEXT:    subq %rbp, %rax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %ebp
-; AVX512-NEXT:    sbbq %rbp, %rbp
-; AVX512-NEXT:    subq %rbx, %rcx
-; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movl $0, %eax
-; AVX512-NEXT:    sbbq %rax, %rax
-; AVX512-NEXT:    vmovq %xmm0, %rbx
-; AVX512-NEXT:    vmovq %xmm1, %rcx
-; AVX512-NEXT:    subq %rcx, %rbx
-; AVX512-NEXT:    sbbq %r12, %r12
-; AVX512-NEXT:    sarq $63, %r12
-; AVX512-NEXT:    xorq %r12, %rbx
-; AVX512-NEXT:    subq %r12, %rbx
-; AVX512-NEXT:    sarq $63, %rax
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT:    xorq %rax, %rcx
-; AVX512-NEXT:    subq %rax, %rcx
-; AVX512-NEXT:    movq %rcx, %rax
-; AVX512-NEXT:    sarq $63, %rbp
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT:    xorq %rbp, %rcx
-; AVX512-NEXT:    subq %rbp, %rcx
-; AVX512-NEXT:    sarq $63, %r13
-; AVX512-NEXT:    xorq %r13, %rdx
-; AVX512-NEXT:    subq %r13, %rdx
-; AVX512-NEXT:    sarq $63, %r15
-; AVX512-NEXT:    xorq %r15, %rsi
-; AVX512-NEXT:    subq %r15, %rsi
-; AVX512-NEXT:    sarq $63, %r14
-; AVX512-NEXT:    xorq %r14, %rdi
-; AVX512-NEXT:    subq %r14, %rdi
-; AVX512-NEXT:    sarq $63, %r11
-; AVX512-NEXT:    xorq %r11, %r8
-; AVX512-NEXT:    subq %r11, %r8
-; AVX512-NEXT:    sarq $63, %r10
-; AVX512-NEXT:    xorq %r10, %r9
-; AVX512-NEXT:    subq %r10, %r9
-; AVX512-NEXT:    vmovq %r9, %xmm0
-; AVX512-NEXT:    vmovq %r8, %xmm1
-; AVX512-NEXT:    vmovq %rdi, %xmm2
-; AVX512-NEXT:    vmovq %rsi, %xmm3
-; AVX512-NEXT:    vmovq %rdx, %xmm4
-; AVX512-NEXT:    vmovq %rcx, %xmm5
-; AVX512-NEXT:    vmovq %rax, %xmm6
-; AVX512-NEXT:    vmovq %rbx, %xmm7
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm4[0]
-; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm6[0]
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT:    retq
   %aext = zext <8 x i64> %a to <8 x i128>
   %bext = zext <8 x i64> %b to <8 x i128>
@@ -542,15 +192,15 @@ define <64 x i8> @abd_minmax_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ;
 ; AVX512DQ-LABEL: abd_minmax_v64i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512DQ-NEXT:    vpminub %ymm3, %ymm4, %ymm5
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminub %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminub %ymm1, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpsubb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmaxub %ymm3, %ymm4, %ymm1
-; AVX512DQ-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpsubb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    retq
   %min = call <64 x i8> @llvm.umin.v64i8(<64 x i8> %a, <64 x i8> %b)
   %max = call <64 x i8> @llvm.umax.v64i8(<64 x i8> %a, <64 x i8> %b)
@@ -568,15 +218,15 @@ define <32 x i16> @abd_minmax_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
 ;
 ; AVX512DQ-LABEL: abd_minmax_v32i16:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm2
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
-; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
-; AVX512DQ-NEXT:    vpminuw %ymm3, %ymm4, %ymm5
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT:    vpminuw %ymm2, %ymm3, %ymm4
+; AVX512DQ-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpminuw %ymm1, %ymm0, %ymm3
 ; AVX512DQ-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpsubw %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmaxuw %ymm3, %ymm4, %ymm1
-; AVX512DQ-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpsubw %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    retq
   %min = call <32 x i16> @llvm.umin.v32i16(<32 x i16> %a, <32 x i16> %b)
   %max = call <32 x i16> @llvm.umax.v32i16(<32 x i16> %a, <32 x i16> %b)

diff  --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index 7b54c1b355374..b00c0e6a09681 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -23,21 +23,21 @@
 define <8 x i32> @vec256_i32_signed_reg_reg(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 ; AVX1-FALLBACK-LABEL: vec256_i32_signed_reg_reg:
 ; AVX1-FALLBACK:       # %bb.0:
-; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
+; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm3
+; AVX1-FALLBACK-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-FALLBACK-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
-; AVX1-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsubd %xmm5, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpminsd %xmm1, %xmm3, %xmm4
+; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpmulld %xmm1, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpmulld %xmm2, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-FALLBACK-NEXT:    vpmulld %xmm1, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-LABEL: vec256_i32_signed_reg_reg:
@@ -52,34 +52,34 @@ define <8 x i32> @vec256_i32_signed_reg_reg(<8 x i32> %a1, <8 x i32> %a2) nounwi
 ;
 ; XOP-FALLBACK-LABEL: vec256_i32_signed_reg_reg:
 ; XOP-FALLBACK:       # %bb.0:
-; XOP-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
-; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; XOP-FALLBACK-NEXT:    vpminsd %xmm3, %xmm4, %xmm5
+; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOP-FALLBACK-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm4
 ; XOP-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
-; XOP-FALLBACK-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT:    vpmaxsd %xmm3, %xmm4, %xmm2
-; XOP-FALLBACK-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpmacsdd %xmm4, %xmm2, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpmacsdd %xmm3, %xmm2, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpmacsdd %xmm0, %xmm1, %xmm1, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: vec256_i32_signed_reg_reg:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm2
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; XOPAVX1-NEXT:    vpminsd %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm4
 ; XOPAVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpmaxsd %xmm3, %xmm4, %xmm2
-; XOPAVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpmacsdd %xmm4, %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmacsdd %xmm3, %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpmacsdd %xmm0, %xmm1, %xmm1, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
@@ -107,21 +107,21 @@ define <8 x i32> @vec256_i32_signed_reg_reg(<8 x i32> %a1, <8 x i32> %a2) nounwi
 define <8 x i32> @vec256_i32_unsigned_reg_reg(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 ; AVX1-FALLBACK-LABEL: vec256_i32_unsigned_reg_reg:
 ; AVX1-FALLBACK:       # %bb.0:
-; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-FALLBACK-NEXT:    vpminud %xmm1, %xmm0, %xmm2
+; AVX1-FALLBACK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm3
+; AVX1-FALLBACK-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-FALLBACK-NEXT:    vpminud %xmm2, %xmm3, %xmm4
-; AVX1-FALLBACK-NEXT:    vpminud %xmm1, %xmm0, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsubd %xmm5, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpminud %xmm1, %xmm3, %xmm4
+; AVX1-FALLBACK-NEXT:    vpmaxud %xmm1, %xmm3, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpmulld %xmm1, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpmulld %xmm2, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-FALLBACK-NEXT:    vpmulld %xmm1, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-LABEL: vec256_i32_unsigned_reg_reg:
@@ -136,34 +136,34 @@ define <8 x i32> @vec256_i32_unsigned_reg_reg(<8 x i32> %a1, <8 x i32> %a2) noun
 ;
 ; XOP-FALLBACK-LABEL: vec256_i32_unsigned_reg_reg:
 ; XOP-FALLBACK:       # %bb.0:
-; XOP-FALLBACK-NEXT:    vpminud %xmm1, %xmm0, %xmm2
-; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; XOP-FALLBACK-NEXT:    vpminud %xmm3, %xmm4, %xmm5
+; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOP-FALLBACK-NEXT:    vpminud %xmm2, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpminud %xmm1, %xmm0, %xmm4
 ; XOP-FALLBACK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
-; XOP-FALLBACK-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT:    vpmaxud %xmm3, %xmm4, %xmm2
-; XOP-FALLBACK-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpmacsdd %xmm4, %xmm2, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpmacsdd %xmm3, %xmm2, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpmacsdd %xmm0, %xmm1, %xmm1, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: vec256_i32_unsigned_reg_reg:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm2
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; XOPAVX1-NEXT:    vpminud %xmm3, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm4
 ; XOPAVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpmaxud %xmm3, %xmm4, %xmm2
-; XOPAVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpmacsdd %xmm4, %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmacsdd %xmm3, %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpmacsdd %xmm0, %xmm1, %xmm1, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
@@ -193,22 +193,22 @@ define <8 x i32> @vec256_i32_unsigned_reg_reg(<8 x i32> %a1, <8 x i32> %a2) noun
 define <8 x i32> @vec256_i32_signed_mem_reg(ptr %a1_addr, <8 x i32> %a2) nounwind {
 ; AVX1-FALLBACK-LABEL: vec256_i32_signed_mem_reg:
 ; AVX1-FALLBACK:       # %bb.0:
-; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm2
-; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX1-FALLBACK-NEXT:    vpminsd %xmm1, %xmm3, %xmm4
-; AVX1-FALLBACK-NEXT:    vpminsd %xmm0, %xmm2, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-FALLBACK-NEXT:    vpminsd %xmm0, %xmm1, %xmm3
+; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm4
+; AVX1-FALLBACK-NEXT:    vpsubd %xmm3, %xmm4, %xmm3
+; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-FALLBACK-NEXT:    vpminsd %xmm0, %xmm2, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm0, %xmm2, %xmm0
-; AVX1-FALLBACK-NEXT:    vpsubd %xmm5, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT:    vpmulld %xmm3, %xmm3, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpmulld %xmm0, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT:    vpmulld %xmm1, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-FALLBACK-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-LABEL: vec256_i32_signed_mem_reg:
@@ -224,38 +224,38 @@ define <8 x i32> @vec256_i32_signed_mem_reg(ptr %a1_addr, <8 x i32> %a2) nounwin
 ;
 ; XOP-FALLBACK-LABEL: vec256_i32_signed_mem_reg:
 ; XOP-FALLBACK:       # %bb.0:
-; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
-; XOP-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; XOP-FALLBACK-NEXT:    vpminsd %xmm0, %xmm1, %xmm3
-; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; XOP-FALLBACK-NEXT:    vpminsd %xmm4, %xmm2, %xmm5
-; XOP-FALLBACK-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
-; XOP-FALLBACK-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT:    vpmaxsd %xmm4, %xmm2, %xmm3
-; XOP-FALLBACK-NEXT:    vpsubd %xmm5, %xmm3, %xmm3
+; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm2
+; XOP-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
+; XOP-FALLBACK-NEXT:    vpminsd %xmm1, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
+; XOP-FALLBACK-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT:    vpminsd %xmm0, %xmm2, %xmm4
+; XOP-FALLBACK-NEXT:    vpmaxsd %xmm0, %xmm2, %xmm0
+; XOP-FALLBACK-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm3, %xmm3
-; XOP-FALLBACK-NEXT:    vpmacsdd %xmm2, %xmm3, %xmm3, %xmm2
-; XOP-FALLBACK-NEXT:    vpmacsdd %xmm1, %xmm0, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT:    vpmacsdd %xmm3, %xmm1, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT:    vpmacsdd %xmm2, %xmm0, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: vec256_i32_signed_mem_reg:
 ; XOPAVX1:       # %bb.0:
-; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm1
-; XOPAVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
-; XOPAVX1-NEXT:    vpminsd %xmm0, %xmm1, %xmm3
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; XOPAVX1-NEXT:    vpminsd %xmm4, %xmm2, %xmm5
-; XOPAVX1-NEXT:    vpmaxsd %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpmaxsd %xmm4, %xmm2, %xmm3
-; XOPAVX1-NEXT:    vpsubd %xmm5, %xmm3, %xmm3
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm2
+; XOPAVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
+; XOPAVX1-NEXT:    vpminsd %xmm1, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpminsd %xmm0, %xmm2, %xmm4
+; XOPAVX1-NEXT:    vpmaxsd %xmm0, %xmm2, %xmm0
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
-; XOPAVX1-NEXT:    vpmacsdd %xmm2, %xmm3, %xmm3, %xmm2
-; XOPAVX1-NEXT:    vpmacsdd %xmm1, %xmm0, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; XOPAVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpmacsdd %xmm3, %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpmacsdd %xmm2, %xmm0, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
 ; AVX512-LABEL: vec256_i32_signed_mem_reg:
@@ -285,13 +285,13 @@ define <8 x i32> @vec256_i32_signed_reg_mem(<8 x i32> %a1, ptr %a2_addr) nounwin
 ; AVX1-FALLBACK:       # %bb.0:
 ; AVX1-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX1-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
+; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
-; AVX1-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm5
 ; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsubd %xmm5, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpmulld %xmm1, %xmm1, %xmm1
@@ -316,16 +316,16 @@ define <8 x i32> @vec256_i32_signed_reg_mem(<8 x i32> %a1, ptr %a2_addr) nounwin
 ; XOP-FALLBACK:       # %bb.0:
 ; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; XOP-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm2
-; XOP-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
-; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; XOP-FALLBACK-NEXT:    vpminsd %xmm2, %xmm4, %xmm5
+; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOP-FALLBACK-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpminsd %xmm1, %xmm0, %xmm4
 ; XOP-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
-; XOP-FALLBACK-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT:    vpmaxsd %xmm2, %xmm4, %xmm2
-; XOP-FALLBACK-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpmacsdd %xmm4, %xmm2, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpmacsdd %xmm3, %xmm2, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpmacsdd %xmm0, %xmm1, %xmm1, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
@@ -334,16 +334,16 @@ define <8 x i32> @vec256_i32_signed_reg_mem(<8 x i32> %a1, ptr %a2_addr) nounwin
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm1
 ; XOPAVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
-; XOPAVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm3
-; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; XOPAVX1-NEXT:    vpminsd %xmm2, %xmm4, %xmm5
+; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm4
 ; XOPAVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpmaxsd %xmm2, %xmm4, %xmm2
-; XOPAVX1-NEXT:    vpsubd %xmm5, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpmacsdd %xmm4, %xmm2, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmacsdd %xmm3, %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpmacsdd %xmm0, %xmm1, %xmm1, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
@@ -377,12 +377,12 @@ define <8 x i32> @vec256_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rsi), %xmm1
 ; AVX1-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm2
 ; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
+; AVX1-FALLBACK-NEXT:    vpminsd %xmm0, %xmm2, %xmm4
+; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpminsd %xmm1, %xmm3, %xmm4
-; AVX1-FALLBACK-NEXT:    vpminsd %xmm0, %xmm2, %xmm5
 ; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpmaxsd %xmm0, %xmm2, %xmm0
-; AVX1-FALLBACK-NEXT:    vpsubd %xmm5, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsrld $1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpmulld %xmm0, %xmm0, %xmm0
@@ -410,12 +410,12 @@ define <8 x i32> @vec256_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; XOP-FALLBACK-NEXT:    vmovdqa 16(%rsi), %xmm1
 ; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm2
 ; XOP-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
+; XOP-FALLBACK-NEXT:    vpminsd %xmm1, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
+; XOP-FALLBACK-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpminsd %xmm0, %xmm2, %xmm4
-; XOP-FALLBACK-NEXT:    vpminsd %xmm1, %xmm3, %xmm5
 ; XOP-FALLBACK-NEXT:    vpmaxsd %xmm0, %xmm2, %xmm0
 ; XOP-FALLBACK-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
-; XOP-FALLBACK-NEXT:    vpsubd %xmm5, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpmacsdd %xmm3, %xmm1, %xmm1, %xmm1
@@ -429,12 +429,12 @@ define <8 x i32> @vec256_i32_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; XOPAVX1-NEXT:    vmovdqa 16(%rsi), %xmm1
 ; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm2
 ; XOPAVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
+; XOPAVX1-NEXT:    vpminsd %xmm1, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpminsd %xmm0, %xmm2, %xmm4
-; XOPAVX1-NEXT:    vpminsd %xmm1, %xmm3, %xmm5
 ; XOPAVX1-NEXT:    vpmaxsd %xmm0, %xmm2, %xmm0
 ; XOPAVX1-NEXT:    vpsubd %xmm4, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpmaxsd %xmm1, %xmm3, %xmm1
-; XOPAVX1-NEXT:    vpsubd %xmm5, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpmacsdd %xmm3, %xmm1, %xmm1, %xmm1
@@ -1557,12 +1557,12 @@ define <16 x i16> @vec256_i16_signed_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nou
 ; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpcmpgtw %xmm2, %xmm3, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm5
+; AVX1-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm6
+; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpminsw %xmm2, %xmm3, %xmm6
-; AVX1-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm7
 ; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsubw %xmm6, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsubw %xmm7, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -1593,12 +1593,12 @@ define <16 x i16> @vec256_i16_signed_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nou
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOP-FALLBACK-NEXT:    vpcomgtw %xmm2, %xmm3, %xmm4
 ; XOP-FALLBACK-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm5
+; XOP-FALLBACK-NEXT:    vpminsw %xmm2, %xmm3, %xmm6
+; XOP-FALLBACK-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT:    vpsubw %xmm6, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm6
-; XOP-FALLBACK-NEXT:    vpminsw %xmm2, %xmm3, %xmm7
 ; XOP-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
-; XOP-FALLBACK-NEXT:    vpsubw %xmm7, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -1615,12 +1615,12 @@ define <16 x i16> @vec256_i16_signed_reg_reg(<16 x i16> %a1, <16 x i16> %a2) nou
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOPAVX1-NEXT:    vpcomgtw %xmm2, %xmm3, %xmm4
 ; XOPAVX1-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm5
+; XOPAVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm6
+; XOPAVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpsubw %xmm6, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm6
-; XOPAVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm7
 ; XOPAVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
 ; XOPAVX1-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT:    vpsubw %xmm7, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -1705,9 +1705,9 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
 ; AVX1-FALLBACK-NEXT:    vpminuw %xmm1, %xmm0, %xmm7
 ; AVX1-FALLBACK-NEXT:    vpcmpeqw %xmm7, %xmm0, %xmm8
 ; AVX1-FALLBACK-NEXT:    vpxor %xmm6, %xmm8, %xmm6
-; AVX1-FALLBACK-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubw %xmm7, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsubw %xmm4, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
@@ -1741,12 +1741,12 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOP-FALLBACK-NEXT:    vpcomgtuw %xmm2, %xmm3, %xmm4
 ; XOP-FALLBACK-NEXT:    vpcomgtuw %xmm1, %xmm0, %xmm5
+; XOP-FALLBACK-NEXT:    vpminuw %xmm2, %xmm3, %xmm6
+; XOP-FALLBACK-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT:    vpsubw %xmm6, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpminuw %xmm1, %xmm0, %xmm6
-; XOP-FALLBACK-NEXT:    vpminuw %xmm2, %xmm3, %xmm7
 ; XOP-FALLBACK-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
-; XOP-FALLBACK-NEXT:    vpsubw %xmm7, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -1763,12 +1763,12 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOPAVX1-NEXT:    vpcomgtuw %xmm2, %xmm3, %xmm4
 ; XOPAVX1-NEXT:    vpcomgtuw %xmm1, %xmm0, %xmm5
+; XOPAVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm6
+; XOPAVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpsubw %xmm6, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm6
-; XOPAVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm7
 ; XOPAVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm1
 ; XOPAVX1-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT:    vpsubw %xmm7, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -1853,12 +1853,12 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw
 ; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; AVX1-FALLBACK-NEXT:    vpcmpgtw %xmm1, %xmm3, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm5
+; AVX1-FALLBACK-NEXT:    vpminsw %xmm0, %xmm2, %xmm6
+; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsubw %xmm6, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpminsw %xmm1, %xmm3, %xmm6
-; AVX1-FALLBACK-NEXT:    vpminsw %xmm0, %xmm2, %xmm7
 ; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm3, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm0, %xmm2, %xmm0
-; AVX1-FALLBACK-NEXT:    vpsubw %xmm7, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -1891,12 +1891,12 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw
 ; XOP-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; XOP-FALLBACK-NEXT:    vpcomgtw %xmm1, %xmm3, %xmm4
 ; XOP-FALLBACK-NEXT:    vpcomgtw %xmm0, %xmm2, %xmm5
+; XOP-FALLBACK-NEXT:    vpminsw %xmm1, %xmm3, %xmm6
+; XOP-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm3, %xmm1
+; XOP-FALLBACK-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpminsw %xmm0, %xmm2, %xmm6
-; XOP-FALLBACK-NEXT:    vpminsw %xmm1, %xmm3, %xmm7
 ; XOP-FALLBACK-NEXT:    vpmaxsw %xmm0, %xmm2, %xmm0
 ; XOP-FALLBACK-NEXT:    vpsubw %xmm6, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm3, %xmm1
-; XOP-FALLBACK-NEXT:    vpsubw %xmm7, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -1914,12 +1914,12 @@ define <16 x i16> @vec256_i16_signed_mem_reg(ptr %a1_addr, <16 x i16> %a2) nounw
 ; XOPAVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; XOPAVX1-NEXT:    vpcomgtw %xmm1, %xmm3, %xmm4
 ; XOPAVX1-NEXT:    vpcomgtw %xmm0, %xmm2, %xmm5
+; XOPAVX1-NEXT:    vpminsw %xmm1, %xmm3, %xmm6
+; XOPAVX1-NEXT:    vpmaxsw %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpminsw %xmm0, %xmm2, %xmm6
-; XOPAVX1-NEXT:    vpminsw %xmm1, %xmm3, %xmm7
 ; XOPAVX1-NEXT:    vpmaxsw %xmm0, %xmm2, %xmm0
 ; XOPAVX1-NEXT:    vpsubw %xmm6, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpmaxsw %xmm1, %xmm3, %xmm1
-; XOPAVX1-NEXT:    vpsubw %xmm7, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -2004,12 +2004,12 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw
 ; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpcmpgtw %xmm2, %xmm3, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm5
+; AVX1-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm6
+; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpminsw %xmm2, %xmm3, %xmm6
-; AVX1-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm7
 ; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsubw %xmm6, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsubw %xmm7, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -2042,12 +2042,12 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOP-FALLBACK-NEXT:    vpcomgtw %xmm2, %xmm3, %xmm4
 ; XOP-FALLBACK-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm5
+; XOP-FALLBACK-NEXT:    vpminsw %xmm2, %xmm3, %xmm6
+; XOP-FALLBACK-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
+; XOP-FALLBACK-NEXT:    vpsubw %xmm6, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpminsw %xmm1, %xmm0, %xmm6
-; XOP-FALLBACK-NEXT:    vpminsw %xmm2, %xmm3, %xmm7
 ; XOP-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
-; XOP-FALLBACK-NEXT:    vpsubw %xmm7, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -2065,12 +2065,12 @@ define <16 x i16> @vec256_i16_signed_reg_mem(<16 x i16> %a1, ptr %a2_addr) nounw
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOPAVX1-NEXT:    vpcomgtw %xmm2, %xmm3, %xmm4
 ; XOPAVX1-NEXT:    vpcomgtw %xmm1, %xmm0, %xmm5
+; XOPAVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm6
+; XOPAVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
+; XOPAVX1-NEXT:    vpsubw %xmm6, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm6
-; XOPAVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm7
 ; XOPAVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm1
 ; XOPAVX1-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT:    vpsubw %xmm7, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -2156,12 +2156,12 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
 ; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; AVX1-FALLBACK-NEXT:    vpcmpgtw %xmm1, %xmm3, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm5
+; AVX1-FALLBACK-NEXT:    vpminsw %xmm0, %xmm2, %xmm6
+; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsubw %xmm6, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpminsw %xmm1, %xmm3, %xmm6
-; AVX1-FALLBACK-NEXT:    vpminsw %xmm0, %xmm2, %xmm7
 ; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm3, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpmaxsw %xmm0, %xmm2, %xmm0
-; AVX1-FALLBACK-NEXT:    vpsubw %xmm7, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -2196,12 +2196,12 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
 ; XOP-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; XOP-FALLBACK-NEXT:    vpcomgtw %xmm1, %xmm3, %xmm4
 ; XOP-FALLBACK-NEXT:    vpcomgtw %xmm0, %xmm2, %xmm5
+; XOP-FALLBACK-NEXT:    vpminsw %xmm1, %xmm3, %xmm6
+; XOP-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm3, %xmm1
+; XOP-FALLBACK-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpminsw %xmm0, %xmm2, %xmm6
-; XOP-FALLBACK-NEXT:    vpminsw %xmm1, %xmm3, %xmm7
 ; XOP-FALLBACK-NEXT:    vpmaxsw %xmm0, %xmm2, %xmm0
 ; XOP-FALLBACK-NEXT:    vpsubw %xmm6, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT:    vpmaxsw %xmm1, %xmm3, %xmm1
-; XOP-FALLBACK-NEXT:    vpsubw %xmm7, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -2220,12 +2220,12 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
 ; XOPAVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; XOPAVX1-NEXT:    vpcomgtw %xmm1, %xmm3, %xmm4
 ; XOPAVX1-NEXT:    vpcomgtw %xmm0, %xmm2, %xmm5
+; XOPAVX1-NEXT:    vpminsw %xmm1, %xmm3, %xmm6
+; XOPAVX1-NEXT:    vpmaxsw %xmm1, %xmm3, %xmm1
+; XOPAVX1-NEXT:    vpsubw %xmm6, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpminsw %xmm0, %xmm2, %xmm6
-; XOPAVX1-NEXT:    vpminsw %xmm1, %xmm3, %xmm7
 ; XOPAVX1-NEXT:    vpmaxsw %xmm0, %xmm2, %xmm0
 ; XOPAVX1-NEXT:    vpsubw %xmm6, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpmaxsw %xmm1, %xmm3, %xmm1
-; XOPAVX1-NEXT:    vpsubw %xmm7, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1]
@@ -2319,12 +2319,12 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
 ; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpcmpgtb %xmm3, %xmm2, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm5
+; AVX1-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm6
+; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpminsb %xmm3, %xmm2, %xmm6
-; AVX1-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm7
 ; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm3, %xmm2, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpsubb %xmm6, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsubb %xmm7, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm3, %xmm3
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX1-FALLBACK-NEXT:    vpand %xmm6, %xmm3, %xmm3
@@ -2385,12 +2385,12 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOP-FALLBACK-NEXT:    vpcomgtb %xmm2, %xmm3, %xmm4
 ; XOP-FALLBACK-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm5
+; XOP-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm6
+; XOP-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
+; XOP-FALLBACK-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpminsb %xmm2, %xmm3, %xmm6
-; XOP-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm7
 ; XOP-FALLBACK-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
-; XOP-FALLBACK-NEXT:    vpsubb %xmm7, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
 ; XOP-FALLBACK-NEXT:    vpshlb %xmm6, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpshlb %xmm6, %xmm1, %xmm1
@@ -2423,12 +2423,12 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOPAVX1-NEXT:    vpcomgtb %xmm2, %xmm3, %xmm4
 ; XOPAVX1-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm5
+; XOPAVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm6
+; XOPAVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm6
-; XOPAVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm7
 ; XOPAVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
 ; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT:    vpsubb %xmm7, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
 ; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm1, %xmm1
@@ -2544,9 +2544,9 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
 ; AVX1-FALLBACK-NEXT:    vpminub %xmm1, %xmm0, %xmm7
 ; AVX1-FALLBACK-NEXT:    vpcmpeqb %xmm7, %xmm0, %xmm8
 ; AVX1-FALLBACK-NEXT:    vpxor %xmm6, %xmm8, %xmm6
-; AVX1-FALLBACK-NEXT:    vpmaxub %xmm3, %xmm2, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubb %xmm7, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpmaxub %xmm3, %xmm2, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpsubb %xmm4, %xmm3, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm3, %xmm3
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
@@ -2610,12 +2610,12 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOP-FALLBACK-NEXT:    vpcomgtub %xmm2, %xmm3, %xmm4
 ; XOP-FALLBACK-NEXT:    vpcomgtub %xmm1, %xmm0, %xmm5
+; XOP-FALLBACK-NEXT:    vpminub %xmm1, %xmm0, %xmm6
+; XOP-FALLBACK-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
+; XOP-FALLBACK-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpminub %xmm2, %xmm3, %xmm6
-; XOP-FALLBACK-NEXT:    vpminub %xmm1, %xmm0, %xmm7
 ; XOP-FALLBACK-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
-; XOP-FALLBACK-NEXT:    vpsubb %xmm7, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
 ; XOP-FALLBACK-NEXT:    vpshlb %xmm6, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpshlb %xmm6, %xmm1, %xmm1
@@ -2648,12 +2648,12 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOPAVX1-NEXT:    vpcomgtub %xmm2, %xmm3, %xmm4
 ; XOPAVX1-NEXT:    vpcomgtub %xmm1, %xmm0, %xmm5
+; XOPAVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm6
+; XOPAVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm6
-; XOPAVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm7
 ; XOPAVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
 ; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT:    vpsubb %xmm7, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
 ; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm1, %xmm1
@@ -2769,12 +2769,12 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; AVX1-FALLBACK-NEXT:    vpcmpgtb %xmm3, %xmm2, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm5
+; AVX1-FALLBACK-NEXT:    vpminsb %xmm0, %xmm1, %xmm6
+; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsubb %xmm6, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpminsb %xmm3, %xmm2, %xmm6
-; AVX1-FALLBACK-NEXT:    vpminsb %xmm0, %xmm1, %xmm7
 ; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm3, %xmm2, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpsubb %xmm6, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
-; AVX1-FALLBACK-NEXT:    vpsubb %xmm7, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm3, %xmm3
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX1-FALLBACK-NEXT:    vpand %xmm6, %xmm3, %xmm3
@@ -2837,12 +2837,12 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; XOP-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; XOP-FALLBACK-NEXT:    vpcomgtb %xmm2, %xmm3, %xmm4
 ; XOP-FALLBACK-NEXT:    vpcomgtb %xmm0, %xmm1, %xmm5
+; XOP-FALLBACK-NEXT:    vpminsb %xmm0, %xmm1, %xmm6
+; XOP-FALLBACK-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT:    vpsubb %xmm6, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vpminsb %xmm2, %xmm3, %xmm6
-; XOP-FALLBACK-NEXT:    vpminsb %xmm0, %xmm1, %xmm7
 ; XOP-FALLBACK-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
-; XOP-FALLBACK-NEXT:    vpsubb %xmm7, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
 ; XOP-FALLBACK-NEXT:    vpshlb %xmm6, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpshlb %xmm6, %xmm0, %xmm0
@@ -2876,12 +2876,12 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
 ; XOPAVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; XOPAVX1-NEXT:    vpcomgtb %xmm2, %xmm3, %xmm4
 ; XOPAVX1-NEXT:    vpcomgtb %xmm0, %xmm1, %xmm5
+; XOPAVX1-NEXT:    vpminsb %xmm0, %xmm1, %xmm6
+; XOPAVX1-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm6
-; XOPAVX1-NEXT:    vpminsb %xmm0, %xmm1, %xmm7
 ; XOPAVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
 ; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpmaxsb %xmm0, %xmm1, %xmm0
-; XOPAVX1-NEXT:    vpsubb %xmm7, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
 ; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm0, %xmm0
@@ -2997,12 +2997,12 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpcmpgtb %xmm2, %xmm0, %xmm5
+; AVX1-FALLBACK-NEXT:    vpminsb %xmm2, %xmm0, %xmm6
+; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpminsb %xmm3, %xmm1, %xmm6
-; AVX1-FALLBACK-NEXT:    vpminsb %xmm2, %xmm0, %xmm7
 ; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm3, %xmm1, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpsubb %xmm6, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsubb %xmm7, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm3, %xmm3
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX1-FALLBACK-NEXT:    vpand %xmm6, %xmm3, %xmm3
@@ -3065,12 +3065,12 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOP-FALLBACK-NEXT:    vpcomgtb %xmm2, %xmm3, %xmm4
 ; XOP-FALLBACK-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm5
+; XOP-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm6
+; XOP-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
+; XOP-FALLBACK-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpminsb %xmm2, %xmm3, %xmm6
-; XOP-FALLBACK-NEXT:    vpminsb %xmm1, %xmm0, %xmm7
 ; XOP-FALLBACK-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
-; XOP-FALLBACK-NEXT:    vpsubb %xmm7, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
 ; XOP-FALLBACK-NEXT:    vpshlb %xmm6, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpshlb %xmm6, %xmm1, %xmm1
@@ -3104,12 +3104,12 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; XOPAVX1-NEXT:    vpcomgtb %xmm2, %xmm3, %xmm4
 ; XOPAVX1-NEXT:    vpcomgtb %xmm1, %xmm0, %xmm5
+; XOPAVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm6
+; XOPAVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
+; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm6
-; XOPAVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm7
 ; XOPAVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
 ; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm1
-; XOPAVX1-NEXT:    vpsubb %xmm7, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
 ; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm1, %xmm1
@@ -3226,12 +3226,12 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm1
 ; AVX1-FALLBACK-NEXT:    vpcmpgtb %xmm3, %xmm1, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpcmpgtb %xmm2, %xmm0, %xmm5
+; AVX1-FALLBACK-NEXT:    vpminsb %xmm2, %xmm0, %xmm6
+; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsubb %xmm6, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpminsb %xmm3, %xmm1, %xmm6
-; AVX1-FALLBACK-NEXT:    vpminsb %xmm2, %xmm0, %xmm7
 ; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm3, %xmm1, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpsubb %xmm6, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsubb %xmm7, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsrlw $1, %xmm3, %xmm3
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
 ; AVX1-FALLBACK-NEXT:    vpand %xmm6, %xmm3, %xmm3
@@ -3296,12 +3296,12 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; XOP-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; XOP-FALLBACK-NEXT:    vpcomgtb %xmm1, %xmm3, %xmm4
 ; XOP-FALLBACK-NEXT:    vpcomgtb %xmm0, %xmm2, %xmm5
+; XOP-FALLBACK-NEXT:    vpminsb %xmm0, %xmm2, %xmm6
+; XOP-FALLBACK-NEXT:    vpmaxsb %xmm0, %xmm2, %xmm0
+; XOP-FALLBACK-NEXT:    vpsubb %xmm6, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vpminsb %xmm1, %xmm3, %xmm6
-; XOP-FALLBACK-NEXT:    vpminsb %xmm0, %xmm2, %xmm7
 ; XOP-FALLBACK-NEXT:    vpmaxsb %xmm1, %xmm3, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT:    vpmaxsb %xmm0, %xmm2, %xmm0
-; XOP-FALLBACK-NEXT:    vpsubb %xmm7, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
 ; XOP-FALLBACK-NEXT:    vpshlb %xmm6, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpshlb %xmm6, %xmm0, %xmm0
@@ -3336,12 +3336,12 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; XOPAVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
 ; XOPAVX1-NEXT:    vpcomgtb %xmm1, %xmm3, %xmm4
 ; XOPAVX1-NEXT:    vpcomgtb %xmm0, %xmm2, %xmm5
+; XOPAVX1-NEXT:    vpminsb %xmm0, %xmm2, %xmm6
+; XOPAVX1-NEXT:    vpmaxsb %xmm0, %xmm2, %xmm0
+; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpminsb %xmm1, %xmm3, %xmm6
-; XOPAVX1-NEXT:    vpminsb %xmm0, %xmm2, %xmm7
 ; XOPAVX1-NEXT:    vpmaxsb %xmm1, %xmm3, %xmm1
 ; XOPAVX1-NEXT:    vpsubb %xmm6, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpmaxsb %xmm0, %xmm2, %xmm0
-; XOPAVX1-NEXT:    vpsubb %xmm7, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
 ; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpshlb %xmm6, %xmm0, %xmm0

diff  --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index 3f4116455c3d0..b032ebf009084 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -275,12 +275,12 @@ define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nou
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpminsw %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsubw %ymm5, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpminsw %ymm1, %ymm0, %ymm5
-; AVX512F-NEXT:    vpminsw %ymm2, %ymm3, %ymm6
 ; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpsubw %ymm6, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
@@ -303,12 +303,12 @@ define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nou
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm0, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm2, %ymm3, %ymm6
 ; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm6, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
@@ -356,9 +356,9 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n
 ; AVX512F-NEXT:    vpminuw %ymm1, %ymm0, %ymm6
 ; AVX512F-NEXT:    vpcmpeqw %ymm6, %ymm0, %ymm7
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512F-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
@@ -384,9 +384,9 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n
 ; AVX512VL-FALLBACK-NEXT:    vpminuw %ymm1, %ymm0, %ymm6
 ; AVX512VL-FALLBACK-NEXT:    vpcmpeqw %ymm6, %ymm0, %ymm7
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpmaxuw %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
@@ -436,12 +436,12 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpminsw %ymm1, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpminsw %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT:    vpminsw %ymm1, %ymm3, %ymm6
 ; AVX512F-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    vpsubw %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -465,12 +465,12 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm3, %ymm6
 ; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -519,12 +519,12 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounw
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpminsw %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsubw %ymm5, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpminsw %ymm1, %ymm0, %ymm5
-; AVX512F-NEXT:    vpminsw %ymm2, %ymm3, %ymm6
 ; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpsubw %ymm6, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
@@ -548,12 +548,12 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounw
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm0, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm2, %ymm3, %ymm6
 ; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm6, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
@@ -603,12 +603,12 @@ define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
 ; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpminsw %ymm1, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpminsw %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT:    vpminsw %ymm1, %ymm3, %ymm6
 ; AVX512F-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    vpsubw %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -633,12 +633,12 @@ define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm3, %ymm6
 ; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm6, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -694,12 +694,12 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
-; AVX512F-NEXT:    vpminsb %ymm2, %ymm3, %ymm6
 ; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpsubb %ymm6, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
@@ -723,12 +723,12 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm3, %ymm6
 ; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm6, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
@@ -778,9 +778,9 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
 ; AVX512F-NEXT:    vpminub %ymm1, %ymm0, %ymm6
 ; AVX512F-NEXT:    vpcmpeqb %ymm6, %ymm0, %ymm7
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512F-NEXT:    vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT:    vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
@@ -807,9 +807,9 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
 ; AVX512VL-FALLBACK-NEXT:    vpminub %ymm1, %ymm0, %ymm6
 ; AVX512VL-FALLBACK-NEXT:    vpcmpeqb %ymm6, %ymm0, %ymm7
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm5, %zmm7, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpmaxub %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
@@ -861,12 +861,12 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm3, %ymm4
 ; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpminsb %ymm1, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpminsb %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT:    vpminsb %ymm1, %ymm3, %ymm6
 ; AVX512F-NEXT:    vpmaxsb %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -891,12 +891,12 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm3, %ymm4
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm3, %ymm6
 ; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm0, %ymm2, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -947,12 +947,12 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
-; AVX512F-NEXT:    vpminsb %ymm2, %ymm3, %ymm6
 ; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpsubb %ymm6, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
@@ -977,12 +977,12 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm3, %ymm6
 ; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm6, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
@@ -1034,12 +1034,12 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm3, %ymm4
 ; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpminsb %ymm1, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpminsb %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT:    vpminsb %ymm1, %ymm3, %ymm6
 ; AVX512F-NEXT:    vpmaxsb %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -1065,12 +1065,12 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm3, %ymm4
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm3, %ymm6
 ; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm0, %ymm2, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm6, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
