[llvm] bc78bae - [X86] Improve combineVectorShiftImm

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 13 08:32:17 PDT 2020


Author: Jay Foad
Date: 2020-04-13T15:54:55+01:00
New Revision: bc78baec4cc3ac95a931f4708f1534af7ff77de5

URL: https://github.com/llvm/llvm-project/commit/bc78baec4cc3ac95a931f4708f1534af7ff77de5
DIFF: https://github.com/llvm/llvm-project/commit/bc78baec4cc3ac95a931f4708f1534af7ff77de5.diff

LOG: [X86] Improve combineVectorShiftImm

Summary:
Fold (shift (shift X, C2), C1) -> (shift X, (C1 + C2)) for logical as
well as arithmetic shifts. This is needed to prevent regressions from
an upcoming funnel shift expansion change.

While we're here, fold (VSRAI -1, C) -> -1 too.
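
As a sanity check, here is a small standalone C++ sketch of the identities
the new combines rely on, using plain 64-bit lanes rather than SelectionDAG
nodes. It is illustrative only (the function and variable names are made up
for this note), and it assumes arithmetic right shift of signed values as
guaranteed by C++20:

  // Standalone model of the folds: not the DAG combine itself, just the
  // lane arithmetic it relies on.
  #include <cassert>
  #include <cstdint>

  // Combine the immediates of two like shifts of a 64-bit element.
  // Out-of-range logical shifts are zero; arithmetic shifts clamp to 63.
  static unsigned combineShiftAmount(unsigned C1, unsigned C2, bool Logical,
                                     bool &FoldsToZero) {
    unsigned New = C1 + C2;
    FoldsToZero = false;
    if (New >= 64) {
      if (Logical) {
        FoldsToZero = true; // (shift (shift X, C2), C1) -> 0
        return 0;
      }
      New = 63;             // an out-of-range VSRAI splats the sign bit
    }
    return New;
  }

  int main() {
    uint64_t X = 0x8000000000000001ULL;
    bool Zero;

    // Logical: (srl (srl X, 1), 32) == (srl X, 33).
    unsigned Amt = combineShiftAmount(32, 1, /*Logical=*/true, Zero);
    assert(!Zero && ((X >> 1) >> 32) == (X >> Amt));

    // Arithmetic: (sra (sra X, 40), 40) == (sra X, 63) after clamping.
    int64_t S = static_cast<int64_t>(X);
    Amt = combineShiftAmount(40, 40, /*Logical=*/false, Zero);
    assert(((S >> 40) >> 40) == (S >> Amt));

    // (VSRAI -1, C) -> -1: arithmetic shifts of all-ones are a no-op.
    assert((int64_t(-1) >> 17) == int64_t(-1));
    return 0;
  }

The clamp to 63 mirrors the existing out-of-range handling at the top of
combineVectorShiftImm, which already reduces a single oversized VSRAI to
NumBitsPerElt - 1.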

Reviewers: RKSimon, craig.topper

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D77300

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
    llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
    llvm/test/CodeGen/X86/midpoint-int-vec-512.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 50abd75e87b9..3dce82fa7228 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41084,26 +41084,37 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
   if (ShiftVal >= NumBitsPerElt) {
     if (LogicalShift)
       return DAG.getConstant(0, SDLoc(N), VT);
-    else
-      ShiftVal = NumBitsPerElt - 1;
+    ShiftVal = NumBitsPerElt - 1;
   }
 
-  // Shift N0 by zero -> N0.
+  // (shift X, 0) -> X
   if (!ShiftVal)
     return N0;
 
-  // Shift zero -> zero.
+  // (shift 0, C) -> 0
   if (ISD::isBuildVectorAllZeros(N0.getNode()))
+    // N0 is all zeros or undef. We guarantee that the bits shifted into the
+    // result are all zeros, not undef.
     return DAG.getConstant(0, SDLoc(N), VT);
 
-  // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
-  // clamped to (NumBitsPerElt - 1).
-  if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
+  // (VSRAI -1, C) -> -1
+  if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
+    // N0 is all ones or undef. We guarantee that the bits shifted into the
+    // result are all ones, not undef.
+    return DAG.getConstant(-1, SDLoc(N), VT);
+
+  // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
+  if (Opcode == N0.getOpcode()) {
     unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
     unsigned NewShiftVal = ShiftVal + ShiftVal2;
-    if (NewShiftVal >= NumBitsPerElt)
+    if (NewShiftVal >= NumBitsPerElt) {
+      // Out of range logical bit shifts are guaranteed to be zero.
+      // Out of range arithmetic bit shifts splat the sign bit.
+      if (LogicalShift)
+        return DAG.getConstant(0, SDLoc(N), VT);
       NewShiftVal = NumBitsPerElt - 1;
-    return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
+    }
+    return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
                        DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
   }
 

diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index 7f0e19e58a0e..1d148cb54e96 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -886,18 +886,18 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; SSE2-NEXT:    pxor %xmm0, %xmm4
 ; SSE2-NEXT:    movdqa %xmm4, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT:    movdqa %xmm4, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1]
 ; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1]
-; SSE2-NEXT:    por %xmm2, %xmm3
 ; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
@@ -905,22 +905,22 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; SSE2-NEXT:    pandn %xmm1, %xmm5
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    psubq %xmm5, %xmm2
-; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psubq %xmm5, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrlq $32, %xmm4
-; SSE2-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    psrlq $33, %xmm3
+; SSE2-NEXT:    pmuludq %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    psrlq $32, %xmm1
-; SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; SSE2-NEXT:    paddq %xmm4, %xmm1
+; SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; SSE2-NEXT:    paddq %xmm3, %xmm1
 ; SSE2-NEXT:    psllq $32, %xmm1
-; SSE2-NEXT:    pmuludq %xmm3, %xmm2
+; SSE2-NEXT:    pmuludq %xmm2, %xmm4
 ; SSE2-NEXT:    paddq %xmm0, %xmm1
-; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    paddq %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -938,32 +938,31 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm4, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [1,1]
-; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT:    por %xmm7, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1,1]
+; SSE41-NEXT:    por %xmm3, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pand %xmm6, %xmm0
 ; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    movdqa %xmm1, %xmm5
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    psubq %xmm5, %xmm1
-; SSE41-NEXT:    psrlq $1, %xmm1
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    psrlq $1, %xmm3
+; SSE41-NEXT:    psrlq $33, %xmm1
+; SSE41-NEXT:    pmuludq %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psrlq $32, %xmm4
+; SSE41-NEXT:    pmuludq %xmm3, %xmm0
+; SSE41-NEXT:    paddq %xmm1, %xmm0
+; SSE41-NEXT:    psllq $32, %xmm0
 ; SSE41-NEXT:    pmuludq %xmm3, %xmm4
-; SSE41-NEXT:    paddq %xmm0, %xmm4
-; SSE41-NEXT:    psllq $32, %xmm4
-; SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; SSE41-NEXT:    paddq %xmm2, %xmm4
-; SSE41-NEXT:    paddq %xmm4, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    paddq %xmm2, %xmm0
+; SSE41-NEXT:    paddq %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-FALLBACK-LABEL: vec128_i64_signed_reg_reg:
@@ -974,16 +973,16 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_reg_reg:
@@ -994,16 +993,16 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; AVX2-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; AVX2-FALLBACK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX2-FALLBACK-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX2-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX2-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX2-FALLBACK-NEXT:    retq
 ;
 ; XOP-LABEL: vec128_i64_signed_reg_reg:
@@ -1014,16 +1013,16 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; XOP-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; XOP-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
 ; XOP-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
-; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; XOP-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; XOP-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; XOP-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; XOP-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; XOP-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; XOP-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOP-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; XOP-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOP-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; XOP-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec128_i64_signed_reg_reg:
@@ -1037,16 +1036,16 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpsrlq $32, %xmm1, %xmm2
-; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX512F-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
-; AVX512F-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
-; AVX512F-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512F-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512F-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1059,16 +1058,16 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; AVX512VL-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
 ; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm1
 ; AVX512VL-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX512VL-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX512VL-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512VL-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512VL-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_reg_reg:
@@ -1082,16 +1081,16 @@ define <2 x i64> @vec128_i64_signed_reg_reg(<2 x i64> %a1, <2 x i64> %a2) nounwi
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vzeroupper
 ; AVX512BW-FALLBACK-NEXT:    retq
   %t3 = icmp sgt <2 x i64> %a1, %a2 ; signed
@@ -1114,18 +1113,18 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; SSE2-NEXT:    pxor %xmm0, %xmm4
 ; SSE2-NEXT:    movdqa %xmm4, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT:    movdqa %xmm4, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1]
 ; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1]
-; SSE2-NEXT:    por %xmm2, %xmm3
 ; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
@@ -1133,22 +1132,22 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; SSE2-NEXT:    pandn %xmm1, %xmm5
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    psubq %xmm5, %xmm2
-; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psubq %xmm5, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrlq $32, %xmm4
-; SSE2-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    psrlq $33, %xmm3
+; SSE2-NEXT:    pmuludq %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    psrlq $32, %xmm1
-; SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; SSE2-NEXT:    paddq %xmm4, %xmm1
+; SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; SSE2-NEXT:    paddq %xmm3, %xmm1
 ; SSE2-NEXT:    psllq $32, %xmm1
-; SSE2-NEXT:    pmuludq %xmm3, %xmm2
+; SSE2-NEXT:    pmuludq %xmm2, %xmm4
 ; SSE2-NEXT:    paddq %xmm0, %xmm1
-; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    paddq %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -1166,32 +1165,31 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; SSE41-NEXT:    pcmpeqd %xmm5, %xmm6
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm4, %xmm7
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm7, %xmm4
-; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [1,1]
-; SSE41-NEXT:    por %xmm4, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT:    por %xmm7, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1,1]
+; SSE41-NEXT:    por %xmm3, %xmm4
 ; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pand %xmm6, %xmm0
 ; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    movdqa %xmm1, %xmm5
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
-; SSE41-NEXT:    movdqa %xmm4, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    psubq %xmm5, %xmm1
-; SSE41-NEXT:    psrlq $1, %xmm1
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm1, %xmm3
+; SSE41-NEXT:    psrlq $1, %xmm3
+; SSE41-NEXT:    psrlq $33, %xmm1
+; SSE41-NEXT:    pmuludq %xmm4, %xmm1
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm1, %xmm4
-; SSE41-NEXT:    psrlq $32, %xmm4
+; SSE41-NEXT:    pmuludq %xmm3, %xmm0
+; SSE41-NEXT:    paddq %xmm1, %xmm0
+; SSE41-NEXT:    psllq $32, %xmm0
 ; SSE41-NEXT:    pmuludq %xmm3, %xmm4
-; SSE41-NEXT:    paddq %xmm0, %xmm4
-; SSE41-NEXT:    psllq $32, %xmm4
-; SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; SSE41-NEXT:    paddq %xmm2, %xmm4
-; SSE41-NEXT:    paddq %xmm4, %xmm3
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    paddq %xmm2, %xmm0
+; SSE41-NEXT:    paddq %xmm4, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1205,16 +1203,16 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm2
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm3
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm3
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm3
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1228,16 +1226,16 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; AVX2-FALLBACK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm2
 ; AVX2-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm2
-; AVX2-FALLBACK-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm3
-; AVX2-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm3, %xmm3
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX2-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm3
+; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm3
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
+; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX2-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX2-FALLBACK-NEXT:    retq
 ;
 ; XOP-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1248,16 +1246,16 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; XOP-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; XOP-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
 ; XOP-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
-; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; XOP-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; XOP-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; XOP-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; XOP-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; XOP-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; XOP-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOP-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; XOP-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOP-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; XOP-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1271,16 +1269,16 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; AVX512F-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpsrlq $32, %xmm1, %xmm2
-; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX512F-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
-; AVX512F-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
-; AVX512F-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512F-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512F-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1293,16 +1291,16 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; AVX512VL-NEXT:    vpminuq %xmm1, %xmm0, %xmm2
 ; AVX512VL-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm1
 ; AVX512VL-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX512VL-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX512VL-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512VL-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512VL-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-FALLBACK-LABEL: vec128_i64_unsigned_reg_reg:
@@ -1316,16 +1314,16 @@ define <2 x i64> @vec128_i64_unsigned_reg_reg(<2 x i64> %a1, <2 x i64> %a2) noun
 ; AVX512BW-FALLBACK-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vzeroupper
 ; AVX512BW-FALLBACK-NEXT:    retq
   %t3 = icmp ugt <2 x i64> %a1, %a2
@@ -1351,18 +1349,18 @@ define <2 x i64> @vec128_i64_signed_mem_reg(<2 x i64>* %a1_addr, <2 x i64> %a2)
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
 ; SSE2-NEXT:    movdqa %xmm4, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT:    movdqa %xmm4, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1]
 ; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1]
-; SSE2-NEXT:    por %xmm2, %xmm3
 ; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
@@ -1370,22 +1368,22 @@ define <2 x i64> @vec128_i64_signed_mem_reg(<2 x i64>* %a1_addr, <2 x i64> %a2)
 ; SSE2-NEXT:    pandn %xmm0, %xmm5
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pandn %xmm0, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    psubq %xmm5, %xmm2
-; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psubq %xmm5, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrlq $32, %xmm4
-; SSE2-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    psrlq $33, %xmm3
+; SSE2-NEXT:    pmuludq %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm3, %xmm0
-; SSE2-NEXT:    paddq %xmm4, %xmm0
+; SSE2-NEXT:    pmuludq %xmm4, %xmm0
+; SSE2-NEXT:    paddq %xmm3, %xmm0
 ; SSE2-NEXT:    psllq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm3, %xmm2
+; SSE2-NEXT:    pmuludq %xmm2, %xmm4
 ; SSE2-NEXT:    paddq %xmm1, %xmm0
-; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: vec128_i64_signed_mem_reg:
@@ -1415,18 +1413,18 @@ define <2 x i64> @vec128_i64_signed_mem_reg(<2 x i64>* %a1_addr, <2 x i64> %a2)
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    psubq %xmm5, %xmm1
-; SSE41-NEXT:    psrlq $1, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    psrlq $1, %xmm0
+; SSE41-NEXT:    psrlq $33, %xmm1
+; SSE41-NEXT:    pmuludq %xmm4, %xmm1
 ; SSE41-NEXT:    movdqa %xmm4, %xmm2
 ; SSE41-NEXT:    psrlq $32, %xmm2
-; SSE41-NEXT:    pmuludq %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrlq $32, %xmm0
+; SSE41-NEXT:    pmuludq %xmm0, %xmm2
+; SSE41-NEXT:    paddq %xmm1, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
 ; SSE41-NEXT:    pmuludq %xmm4, %xmm0
+; SSE41-NEXT:    paddq %xmm3, %xmm2
 ; SSE41-NEXT:    paddq %xmm2, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm4, %xmm1
-; SSE41-NEXT:    paddq %xmm3, %xmm0
-; SSE41-NEXT:    paddq %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-FALLBACK-LABEL: vec128_i64_signed_mem_reg:
@@ -1438,16 +1436,16 @@ define <2 x i64> @vec128_i64_signed_mem_reg(<2 x i64>* %a1_addr, <2 x i64> %a2)
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm1, %xmm0, %xmm4
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm0, %xmm4
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm0, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_mem_reg:
@@ -1459,16 +1457,16 @@ define <2 x i64> @vec128_i64_signed_mem_reg(<2 x i64>* %a1_addr, <2 x i64> %a2)
 ; AVX2-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm1, %xmm0, %xmm4
 ; AVX2-FALLBACK-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
 ; AVX2-FALLBACK-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
-; AVX2-FALLBACK-NEXT:    vpsrlq $1, %xmm0, %xmm0
-; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX2-FALLBACK-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm0, %xmm4
-; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT:    vpsrlq $1, %xmm0, %xmm2
+; AVX2-FALLBACK-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm0, %xmm0
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX2-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX2-FALLBACK-NEXT:    retq
 ;
 ; XOP-LABEL: vec128_i64_signed_mem_reg:
@@ -1480,16 +1478,16 @@ define <2 x i64> @vec128_i64_signed_mem_reg(<2 x i64>* %a1_addr, <2 x i64> %a2)
 ; XOP-NEXT:    vblendvpd %xmm4, %xmm1, %xmm0, %xmm4
 ; XOP-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
-; XOP-NEXT:    vpsrlq $1, %xmm0, %xmm0
-; XOP-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; XOP-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; XOP-NEXT:    vpsrlq $32, %xmm0, %xmm4
-; XOP-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; XOP-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; XOP-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOP-NEXT:    vpsrlq $1, %xmm0, %xmm2
+; XOP-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; XOP-NEXT:    vpmuludq %xmm3, %xmm0, %xmm0
-; XOP-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; XOP-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; XOP-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; XOP-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; XOP-NEXT:    vpsllq $32, %xmm0, %xmm0
+; XOP-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec128_i64_signed_mem_reg:
@@ -1503,16 +1501,16 @@ define <2 x i64> @vec128_i64_signed_mem_reg(<2 x i64>* %a1_addr, <2 x i64> %a2)
 ; AVX512F-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpsrlq $1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm2
-; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX512F-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
-; AVX512F-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
-; AVX512F-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512F-NEXT:    vpsrlq $1, %xmm0, %xmm2
+; AVX512F-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpmuludq %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512F-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512F-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX512F-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1526,16 +1524,16 @@ define <2 x i64> @vec128_i64_signed_mem_reg(<2 x i64>* %a1_addr, <2 x i64> %a2)
 ; AVX512VL-NEXT:    vpminsq %xmm0, %xmm1, %xmm2
 ; AVX512VL-NEXT:    vpmaxsq %xmm0, %xmm1, %xmm0
 ; AVX512VL-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpsrlq $1, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX512VL-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm4
-; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsrlq $1, %xmm0, %xmm2
+; AVX512VL-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512VL-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX512VL-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_mem_reg:
@@ -1549,16 +1547,16 @@ define <2 x i64> @vec128_i64_signed_mem_reg(<2 x i64>* %a1_addr, <2 x i64> %a2)
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm0, %xmm0
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm0, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm0, %xmm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm0, %xmm0
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vzeroupper
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a1 = load <2 x i64>, <2 x i64>* %a1_addr
@@ -1583,18 +1581,18 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr)
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
 ; SSE2-NEXT:    movdqa %xmm5, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm5, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1]
 ; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1]
-; SSE2-NEXT:    por %xmm2, %xmm3
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT:    pand %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm7, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    por %xmm5, %xmm4
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
@@ -1602,22 +1600,22 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr)
 ; SSE2-NEXT:    pandn %xmm1, %xmm4
 ; SSE2-NEXT:    por %xmm5, %xmm4
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    pand %xmm2, %xmm5
-; SSE2-NEXT:    pandn %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm5, %xmm2
-; SSE2-NEXT:    psubq %xmm4, %xmm2
-; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm5
+; SSE2-NEXT:    pandn %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm5, %xmm3
+; SSE2-NEXT:    psubq %xmm4, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrlq $32, %xmm4
-; SSE2-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    psrlq $33, %xmm3
+; SSE2-NEXT:    pmuludq %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    psrlq $32, %xmm1
-; SSE2-NEXT:    pmuludq %xmm3, %xmm1
-; SSE2-NEXT:    paddq %xmm4, %xmm1
+; SSE2-NEXT:    pmuludq %xmm4, %xmm1
+; SSE2-NEXT:    paddq %xmm3, %xmm1
 ; SSE2-NEXT:    psllq $32, %xmm1
-; SSE2-NEXT:    pmuludq %xmm3, %xmm2
+; SSE2-NEXT:    pmuludq %xmm2, %xmm4
 ; SSE2-NEXT:    paddq %xmm0, %xmm1
-; SSE2-NEXT:    paddq %xmm2, %xmm1
+; SSE2-NEXT:    paddq %xmm4, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
@@ -1649,18 +1647,18 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr)
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    psubq %xmm5, %xmm3
-; SSE41-NEXT:    psrlq $1, %xmm3
-; SSE41-NEXT:    movdqa %xmm4, %xmm2
-; SSE41-NEXT:    psrlq $32, %xmm2
-; SSE41-NEXT:    pmuludq %xmm3, %xmm2
-; SSE41-NEXT:    movdqa %xmm3, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm2
+; SSE41-NEXT:    psrlq $1, %xmm2
+; SSE41-NEXT:    psrlq $33, %xmm3
+; SSE41-NEXT:    pmuludq %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm4, %xmm0
-; SSE41-NEXT:    paddq %xmm2, %xmm0
+; SSE41-NEXT:    pmuludq %xmm2, %xmm0
+; SSE41-NEXT:    paddq %xmm3, %xmm0
 ; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm4, %xmm3
+; SSE41-NEXT:    pmuludq %xmm4, %xmm2
 ; SSE41-NEXT:    paddq %xmm1, %xmm0
-; SSE41-NEXT:    paddq %xmm3, %xmm0
+; SSE41-NEXT:    paddq %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-FALLBACK-LABEL: vec128_i64_signed_reg_mem:
@@ -1672,16 +1670,16 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr)
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_reg_mem:
@@ -1693,16 +1691,16 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr)
 ; AVX2-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; AVX2-FALLBACK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX2-FALLBACK-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX2-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX2-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX2-FALLBACK-NEXT:    retq
 ;
 ; XOP-LABEL: vec128_i64_signed_reg_mem:
@@ -1714,16 +1712,16 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr)
 ; XOP-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; XOP-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
 ; XOP-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
-; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; XOP-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; XOP-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; XOP-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; XOP-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; XOP-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; XOP-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOP-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; XOP-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOP-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; XOP-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec128_i64_signed_reg_mem:
@@ -1737,16 +1735,16 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr)
 ; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpsrlq $32, %xmm1, %xmm2
-; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX512F-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
-; AVX512F-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
-; AVX512F-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512F-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512F-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1760,16 +1758,16 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr)
 ; AVX512VL-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
 ; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm1
 ; AVX512VL-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX512VL-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX512VL-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512VL-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512VL-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_reg_mem:
@@ -1783,16 +1781,16 @@ define <2 x i64> @vec128_i64_signed_reg_mem(<2 x i64> %a1, <2 x i64>* %a2_addr)
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vzeroupper
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a2 = load <2 x i64>, <2 x i64>* %a2_addr
@@ -1818,18 +1816,18 @@ define <2 x i64> @vec128_i64_signed_mem_mem(<2 x i64>* %a1_addr, <2 x i64>* %a2_
 ; SSE2-NEXT:    pxor %xmm1, %xmm4
 ; SSE2-NEXT:    movdqa %xmm4, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT:    movdqa %xmm4, %xmm6
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [1,1]
 ; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,1]
-; SSE2-NEXT:    por %xmm2, %xmm3
 ; SSE2-NEXT:    pcmpgtd %xmm4, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pand %xmm6, %xmm4
+; SSE2-NEXT:    pand %xmm7, %xmm4
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
@@ -1837,32 +1835,32 @@ define <2 x i64> @vec128_i64_signed_mem_mem(<2 x i64>* %a1_addr, <2 x i64>* %a2_
 ; SSE2-NEXT:    pandn %xmm0, %xmm5
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm2, %xmm4
-; SSE2-NEXT:    pandn %xmm0, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    psubq %xmm5, %xmm2
-; SSE2-NEXT:    psrlq $1, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    pandn %xmm0, %xmm3
+; SSE2-NEXT:    por %xmm4, %xmm3
+; SSE2-NEXT:    psubq %xmm5, %xmm3
 ; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrlq $32, %xmm4
-; SSE2-NEXT:    pmuludq %xmm2, %xmm4
+; SSE2-NEXT:    psrlq $1, %xmm4
+; SSE2-NEXT:    psrlq $33, %xmm3
+; SSE2-NEXT:    pmuludq %xmm2, %xmm3
 ; SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm3, %xmm0
-; SSE2-NEXT:    paddq %xmm4, %xmm0
+; SSE2-NEXT:    pmuludq %xmm4, %xmm0
+; SSE2-NEXT:    paddq %xmm3, %xmm0
 ; SSE2-NEXT:    psllq $32, %xmm0
-; SSE2-NEXT:    pmuludq %xmm3, %xmm2
+; SSE2-NEXT:    pmuludq %xmm2, %xmm4
 ; SSE2-NEXT:    paddq %xmm1, %xmm0
-; SSE2-NEXT:    paddq %xmm2, %xmm0
+; SSE2-NEXT:    paddq %xmm4, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: vec128_i64_signed_mem_mem:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm3
-; SSE41-NEXT:    movdqa (%rsi), %xmm2
+; SSE41-NEXT:    movdqa (%rdi), %xmm2
+; SSE41-NEXT:    movdqa (%rsi), %xmm3
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648]
-; SSE41-NEXT:    movdqa %xmm2, %xmm5
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
 ; SSE41-NEXT:    pxor %xmm0, %xmm5
-; SSE41-NEXT:    pxor %xmm3, %xmm0
+; SSE41-NEXT:    pxor %xmm2, %xmm0
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
 ; SSE41-NEXT:    pcmpgtd %xmm5, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
@@ -1878,23 +1876,23 @@ define <2 x i64> @vec128_i64_signed_mem_mem(<2 x i64>* %a1_addr, <2 x i64>* %a2_
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
 ; SSE41-NEXT:    pand %xmm6, %xmm0
 ; SSE41-NEXT:    por %xmm5, %xmm0
-; SSE41-NEXT:    movdqa %xmm2, %xmm5
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm5
+; SSE41-NEXT:    movdqa %xmm3, %xmm5
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT:    psubq %xmm5, %xmm2
-; SSE41-NEXT:    psrlq $1, %xmm2
-; SSE41-NEXT:    movdqa %xmm4, %xmm1
-; SSE41-NEXT:    psrlq $32, %xmm1
-; SSE41-NEXT:    pmuludq %xmm2, %xmm1
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; SSE41-NEXT:    psubq %xmm5, %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    psrlq $1, %xmm1
+; SSE41-NEXT:    psrlq $33, %xmm3
+; SSE41-NEXT:    pmuludq %xmm4, %xmm3
+; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm4, %xmm0
-; SSE41-NEXT:    paddq %xmm1, %xmm0
-; SSE41-NEXT:    psllq $32, %xmm0
-; SSE41-NEXT:    pmuludq %xmm4, %xmm2
+; SSE41-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE41-NEXT:    paddq %xmm3, %xmm0
+; SSE41-NEXT:    psllq $32, %xmm0
+; SSE41-NEXT:    pmuludq %xmm4, %xmm1
 ; SSE41-NEXT:    paddq %xmm2, %xmm0
+; SSE41-NEXT:    paddq %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-FALLBACK-LABEL: vec128_i64_signed_mem_mem:
@@ -1907,16 +1905,16 @@ define <2 x i64> @vec128_i64_signed_mem_mem(<2 x i64>* %a1_addr, <2 x i64>* %a2_
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
 ; AVX2-FALLBACK-LABEL: vec128_i64_signed_mem_mem:
@@ -1929,16 +1927,16 @@ define <2 x i64> @vec128_i64_signed_mem_mem(<2 x i64>* %a1_addr, <2 x i64>* %a2_
 ; AVX2-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; AVX2-FALLBACK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX2-FALLBACK-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX2-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX2-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX2-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX2-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX2-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX2-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX2-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX2-FALLBACK-NEXT:    retq
 ;
 ; XOP-LABEL: vec128_i64_signed_mem_mem:
@@ -1951,16 +1949,16 @@ define <2 x i64> @vec128_i64_signed_mem_mem(<2 x i64>* %a1_addr, <2 x i64>* %a2_
 ; XOP-NEXT:    vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
 ; XOP-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
 ; XOP-NEXT:    vpsubq %xmm4, %xmm1, %xmm1
-; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; XOP-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; XOP-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; XOP-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; XOP-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; XOP-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; XOP-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; XOP-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOP-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; XOP-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; XOP-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; XOP-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOP-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOP-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; XOP-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; XOP-NEXT:    retq
 ;
 ; AVX512F-LABEL: vec128_i64_signed_mem_mem:
@@ -1974,16 +1972,16 @@ define <2 x i64> @vec128_i64_signed_mem_mem(<2 x i64>* %a1_addr, <2 x i64>* %a2_
 ; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpsrlq $32, %xmm1, %xmm2
-; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX512F-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
-; AVX512F-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
-; AVX512F-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512F-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512F-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512F-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512F-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512F-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512F-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1998,16 +1996,16 @@ define <2 x i64> @vec128_i64_signed_mem_mem(<2 x i64>* %a1_addr, <2 x i64>* %a2_
 ; AVX512VL-NEXT:    vpminsq %xmm1, %xmm0, %xmm2
 ; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm1
 ; AVX512VL-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsrlq $32, %xmm3, %xmm2
-; AVX512VL-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
-; AVX512VL-NEXT:    vpsrlq $32, %xmm1, %xmm4
-; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512VL-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512VL-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512VL-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512VL-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512VL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-FALLBACK-LABEL: vec128_i64_signed_mem_mem:
@@ -2021,16 +2019,16 @@ define <2 x i64> @vec128_i64_signed_mem_mem(<2 x i64>* %a1_addr, <2 x i64>* %a2_
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsubq %xmm2, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm1, %xmm4
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
-; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
-; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm4
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm4
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
 ; AVX512BW-FALLBACK-NEXT:    vzeroupper
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a1 = load <2 x i64>, <2 x i64>* %a1_addr

diff  --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index c4bd5f8cb1f0..357b5b7f59c3 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -477,7 +477,7 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; AVX1-FALLBACK:       # %bb.0:
 ; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm9
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm5
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm6
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm7
@@ -485,31 +485,31 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; AVX1-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm7
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; AVX1-FALLBACK-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm6
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
@@ -522,23 +522,23 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm0, %ymm1, %ymm4
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsubq %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm2
-; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm4
-; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddq %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-FALLBACK-LABEL: vec256_i64_signed_reg_reg:
 ; XOP-FALLBACK:       # %bb.0:
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOP-FALLBACK-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm9
 ; XOP-FALLBACK-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm5
 ; XOP-FALLBACK-NEXT:    vpcomltq %xmm2, %xmm3, %xmm6
 ; XOP-FALLBACK-NEXT:    vpcomltq %xmm1, %xmm0, %xmm7
@@ -546,31 +546,31 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; XOP-FALLBACK-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; XOP-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm7
+; XOP-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; XOP-FALLBACK-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm6
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; XOP-FALLBACK-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; XOP-FALLBACK-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; XOP-FALLBACK-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
@@ -578,7 +578,7 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm9
 ; XOPAVX1-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm5
 ; XOPAVX1-NEXT:    vpcomltq %xmm2, %xmm3, %xmm6
 ; XOPAVX1-NEXT:    vpcomltq %xmm1, %xmm0, %xmm7
@@ -586,31 +586,31 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; XOPAVX1-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; XOPAVX1-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; XOPAVX1-NEXT:    vpsrlq $1, %xmm1, %xmm7
+; XOPAVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; XOPAVX1-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; XOPAVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm1, %xmm6
-; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; XOPAVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; XOPAVX1-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; XOPAVX1-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; XOPAVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; XOPAVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -625,16 +625,16 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlq $32, %ymm1, %ymm2
-; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
-; AVX512F-NEXT:    vpmuludq %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vec256_i64_signed_reg_reg:
@@ -646,16 +646,16 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; AVX512VL-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlq $32, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
-; AVX512VL-NEXT:    vpsrlq $32, %ymm1, %ymm4
-; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpaddq %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512VL-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-FALLBACK-LABEL: vec256_i64_signed_reg_reg:
@@ -669,16 +669,16 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm1, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm1, %ymm4
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    retq
   %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
   %t4 = select <4 x i1> %t3, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
@@ -711,29 +711,29 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm5, %xmm2, %xmm3, %xmm4
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm8, %xmm2, %xmm3, %xmm3
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,1]
-; AVX1-FALLBACK-NEXT:    vpor %xmm4, %xmm9, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm6
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm6, %xmm1, %xmm6
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm7
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm7, %xmm7
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm7, %xmm6, %xmm6
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm6, %xmm6
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm3, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm3, %xmm7
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm3, %xmm3
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm3, %xmm4
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm5
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm10 = [1,1]
+; AVX1-FALLBACK-NEXT:    vpor %xmm10, %xmm9, %xmm7
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm7, %xmm6
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm6, %xmm5, %xmm6
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm6, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm5, %xmm5
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT:    vpor %xmm10, %xmm8, %xmm6
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm6, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm6, %xmm7
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm4, %xmm7
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm7, %xmm3
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm3, %xmm3
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm6, %xmm4, %xmm4
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm4, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm5, %xmm0
 ; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
@@ -749,23 +749,23 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm0, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $32, %ymm5, %ymm2
-; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT:    vpmuludq %ymm5, %ymm3, %ymm3
-; AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmuludq %ymm5, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrlq $32, %ymm5, %ymm3
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm3
+; AVX2-NEXT:    vpaddq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT:    vpmuludq %ymm5, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-FALLBACK-LABEL: vec256_i64_unsigned_reg_reg:
 ; XOP-FALLBACK:       # %bb.0:
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOP-FALLBACK-NEXT:    vpcomgtuq %xmm2, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT:    vpcomgtuq %xmm2, %xmm3, %xmm9
 ; XOP-FALLBACK-NEXT:    vpcomgtuq %xmm1, %xmm0, %xmm5
 ; XOP-FALLBACK-NEXT:    vpcomltuq %xmm2, %xmm3, %xmm6
 ; XOP-FALLBACK-NEXT:    vpcomltuq %xmm1, %xmm0, %xmm7
@@ -773,31 +773,31 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; XOP-FALLBACK-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; XOP-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm7
+; XOP-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; XOP-FALLBACK-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm6
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; XOP-FALLBACK-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; XOP-FALLBACK-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; XOP-FALLBACK-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
@@ -805,7 +805,7 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT:    vpcomgtuq %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpcomgtuq %xmm2, %xmm3, %xmm9
 ; XOPAVX1-NEXT:    vpcomgtuq %xmm1, %xmm0, %xmm5
 ; XOPAVX1-NEXT:    vpcomltuq %xmm2, %xmm3, %xmm6
 ; XOPAVX1-NEXT:    vpcomltuq %xmm1, %xmm0, %xmm7
@@ -813,31 +813,31 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; XOPAVX1-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; XOPAVX1-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; XOPAVX1-NEXT:    vpsrlq $1, %xmm1, %xmm7
+; XOPAVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; XOPAVX1-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; XOPAVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm1, %xmm6
-; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; XOPAVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; XOPAVX1-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; XOPAVX1-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; XOPAVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; XOPAVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -852,16 +852,16 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; AVX512F-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlq $32, %ymm1, %ymm2
-; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
-; AVX512F-NEXT:    vpmuludq %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vec256_i64_unsigned_reg_reg:
@@ -873,16 +873,16 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; AVX512VL-NEXT:    vpminuq %ymm1, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlq $32, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
-; AVX512VL-NEXT:    vpsrlq $32, %ymm1, %ymm4
-; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpaddq %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512VL-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-FALLBACK-LABEL: vec256_i64_unsigned_reg_reg:
@@ -896,16 +896,16 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
 ; AVX512BW-FALLBACK-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm1, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm1, %ymm4
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    retq
   %t3 = icmp ugt <4 x i64> %a1, %a2
   %t4 = select <4 x i1> %t3, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, <4 x i64> <i64 1, i64 1, i64 1, i64 1>
@@ -926,7 +926,7 @@ define <4 x i64> @vec256_i64_signed_mem_reg(<4 x i64>* %a1_addr, <4 x i64> %a2)
 ; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm9
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm5
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm6
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm7
@@ -934,31 +934,31 @@ define <4 x i64> @vec256_i64_signed_mem_reg(<4 x i64>* %a1_addr, <4 x i64> %a2)
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm5, %xmm1, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm7, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; AVX1-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm0, %xmm7
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; AVX1-FALLBACK-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm0, %xmm7
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm0, %xmm6
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm0, %xmm0
-; AVX1-FALLBACK-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm6, %xmm1
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
@@ -972,16 +972,16 @@ define <4 x i64> @vec256_i64_signed_mem_reg(<4 x i64>* %a1_addr, <4 x i64> %a2)
 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm1, %ymm0, %ymm4
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm2
-; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm4
-; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddq %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm0, %ymm2
+; AVX2-NEXT:    vpsrlq $33, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX2-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
+; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-FALLBACK-LABEL: vec256_i64_signed_mem_reg:
@@ -989,7 +989,7 @@ define <4 x i64> @vec256_i64_signed_mem_reg(<4 x i64>* %a1_addr, <4 x i64> %a2)
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; XOP-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
-; XOP-FALLBACK-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm9
 ; XOP-FALLBACK-NEXT:    vpcomgtq %xmm0, %xmm1, %xmm5
 ; XOP-FALLBACK-NEXT:    vpcomltq %xmm2, %xmm3, %xmm6
 ; XOP-FALLBACK-NEXT:    vpcomltq %xmm0, %xmm1, %xmm7
@@ -997,31 +997,31 @@ define <4 x i64> @vec256_i64_signed_mem_reg(<4 x i64>* %a1_addr, <4 x i64> %a2)
 ; XOP-FALLBACK-NEXT:    vblendvpd %xmm5, %xmm1, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vpsubq %xmm7, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; XOP-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm0, %xmm7
+; XOP-FALLBACK-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; XOP-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; XOP-FALLBACK-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm0, %xmm7
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm0, %xmm6
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; XOP-FALLBACK-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; XOP-FALLBACK-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; XOP-FALLBACK-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm6, %xmm1
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
@@ -1030,7 +1030,7 @@ define <4 x i64> @vec256_i64_signed_mem_reg(<4 x i64>* %a1_addr, <4 x i64> %a2)
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm1
 ; XOPAVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
-; XOPAVX1-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm9
 ; XOPAVX1-NEXT:    vpcomgtq %xmm0, %xmm1, %xmm5
 ; XOPAVX1-NEXT:    vpcomltq %xmm2, %xmm3, %xmm6
 ; XOPAVX1-NEXT:    vpcomltq %xmm0, %xmm1, %xmm7
@@ -1038,31 +1038,31 @@ define <4 x i64> @vec256_i64_signed_mem_reg(<4 x i64>* %a1_addr, <4 x i64> %a2)
 ; XOPAVX1-NEXT:    vblendvpd %xmm5, %xmm1, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpsubq %xmm7, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; XOPAVX1-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; XOPAVX1-NEXT:    vpsrlq $1, %xmm0, %xmm7
+; XOPAVX1-NEXT:    vpsrlq $33, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; XOPAVX1-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; XOPAVX1-NEXT:    vpmuludq %xmm7, %xmm0, %xmm7
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm0, %xmm6
-; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; XOPAVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; XOPAVX1-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; XOPAVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; XOPAVX1-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; XOPAVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; XOPAVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm6, %xmm1
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -1077,16 +1077,16 @@ define <4 x i64> @vec256_i64_signed_mem_reg(<4 x i64>* %a1_addr, <4 x i64> %a2)
 ; AVX512F-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlq $1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm2
-; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
-; AVX512F-NEXT:    vpmuludq %ymm4, %ymm0, %ymm4
-; AVX512F-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlq $1, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsrlq $33, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vec256_i64_signed_mem_reg:
@@ -1099,16 +1099,16 @@ define <4 x i64> @vec256_i64_signed_mem_reg(<4 x i64>* %a1_addr, <4 x i64> %a2)
 ; AVX512VL-NEXT:    vpminsq %ymm0, %ymm1, %ymm2
 ; AVX512VL-NEXT:    vpmaxsq %ymm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlq $1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpsrlq $32, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpmuludq %ymm2, %ymm0, %ymm2
-; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm4
-; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpaddq %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlq $1, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpsrlq $33, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512VL-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
+; AVX512VL-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-FALLBACK-LABEL: vec256_i64_signed_mem_reg:
@@ -1122,16 +1122,16 @@ define <4 x i64> @vec256_i64_signed_mem_reg(<4 x i64>* %a1_addr, <4 x i64> %a2)
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
 ; AVX512BW-FALLBACK-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm0, %ymm0
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm0, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm0, %ymm4
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm0, %ymm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %ymm0, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm0, %ymm0
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a1 = load <4 x i64>, <4 x i64>* %a1_addr
   %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
@@ -1151,7 +1151,7 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, <4 x i64>* %a2_addr)
 ; AVX1-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; AVX1-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm9
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm5
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm6
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm7
@@ -1159,31 +1159,31 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, <4 x i64>* %a2_addr)
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; AVX1-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm7
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; AVX1-FALLBACK-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm6
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
@@ -1197,16 +1197,16 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, <4 x i64>* %a2_addr)
 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm0, %ymm1, %ymm4
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsubq %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm2
-; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm4
-; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddq %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-FALLBACK-LABEL: vec256_i64_signed_reg_mem:
@@ -1214,7 +1214,7 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, <4 x i64>* %a2_addr)
 ; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm1
 ; XOP-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; XOP-FALLBACK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOP-FALLBACK-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm9
 ; XOP-FALLBACK-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm5
 ; XOP-FALLBACK-NEXT:    vpcomltq %xmm2, %xmm3, %xmm6
 ; XOP-FALLBACK-NEXT:    vpcomltq %xmm1, %xmm0, %xmm7
@@ -1222,31 +1222,31 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, <4 x i64>* %a2_addr)
 ; XOP-FALLBACK-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; XOP-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm7
+; XOP-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; XOP-FALLBACK-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm6
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; XOP-FALLBACK-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; XOP-FALLBACK-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; XOP-FALLBACK-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
@@ -1255,7 +1255,7 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, <4 x i64>* %a2_addr)
 ; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm1
 ; XOPAVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm9
 ; XOPAVX1-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm5
 ; XOPAVX1-NEXT:    vpcomltq %xmm2, %xmm3, %xmm6
 ; XOPAVX1-NEXT:    vpcomltq %xmm1, %xmm0, %xmm7
@@ -1263,31 +1263,31 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, <4 x i64>* %a2_addr)
 ; XOPAVX1-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; XOPAVX1-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; XOPAVX1-NEXT:    vpsrlq $1, %xmm1, %xmm7
+; XOPAVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; XOPAVX1-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; XOPAVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm1, %xmm6
-; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; XOPAVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; XOPAVX1-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; XOPAVX1-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; XOPAVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; XOPAVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -1302,16 +1302,16 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, <4 x i64>* %a2_addr)
 ; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlq $32, %ymm1, %ymm2
-; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
-; AVX512F-NEXT:    vpmuludq %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vec256_i64_signed_reg_mem:
@@ -1324,16 +1324,16 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, <4 x i64>* %a2_addr)
 ; AVX512VL-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlq $32, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
-; AVX512VL-NEXT:    vpsrlq $32, %ymm1, %ymm4
-; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpaddq %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512VL-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-FALLBACK-LABEL: vec256_i64_signed_reg_mem:
@@ -1347,16 +1347,16 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, <4 x i64>* %a2_addr)
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm1, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm1, %ymm4
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a2 = load <4 x i64>, <4 x i64>* %a2_addr
   %t3 = icmp sgt <4 x i64> %a1, %a2 ; signed
@@ -1377,7 +1377,7 @@ define <4 x i64> @vec256_i64_signed_mem_mem(<4 x i64>* %a1_addr, <4 x i64>* %a2_
 ; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rsi), %xmm2
 ; AVX1-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX1-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
-; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm9
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm5
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm6
 ; AVX1-FALLBACK-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm7
@@ -1385,31 +1385,31 @@ define <4 x i64> @vec256_i64_signed_mem_mem(<4 x i64>* %a1_addr, <4 x i64>* %a2_
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; AVX1-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; AVX1-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm7
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; AVX1-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; AVX1-FALLBACK-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm6
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; AVX1-FALLBACK-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm5, %xmm5
-; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; AVX1-FALLBACK-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; AVX1-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; AVX1-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; AVX1-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-FALLBACK-NEXT:    retq
 ;
@@ -1424,16 +1424,16 @@ define <4 x i64> @vec256_i64_signed_mem_mem(<4 x i64>* %a1_addr, <4 x i64>* %a2_
 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm0, %ymm1, %ymm4
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsubq %ymm4, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm2
-; AVX2-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm4
-; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
-; AVX2-NEXT:    vpaddq %ymm4, %ymm2, %ymm2
-; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX2-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX2-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX2-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX2-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX2-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX2-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; XOP-FALLBACK-LABEL: vec256_i64_signed_mem_mem:
@@ -1442,7 +1442,7 @@ define <4 x i64> @vec256_i64_signed_mem_mem(<4 x i64>* %a1_addr, <4 x i64>* %a2_
 ; XOP-FALLBACK-NEXT:    vmovdqa 16(%rsi), %xmm2
 ; XOP-FALLBACK-NEXT:    vmovdqa (%rdi), %xmm0
 ; XOP-FALLBACK-NEXT:    vmovdqa 16(%rdi), %xmm3
-; XOP-FALLBACK-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm4
+; XOP-FALLBACK-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm9
 ; XOP-FALLBACK-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm5
 ; XOP-FALLBACK-NEXT:    vpcomltq %xmm2, %xmm3, %xmm6
 ; XOP-FALLBACK-NEXT:    vpcomltq %xmm1, %xmm0, %xmm7
@@ -1450,31 +1450,31 @@ define <4 x i64> @vec256_i64_signed_mem_mem(<4 x i64>* %a1_addr, <4 x i64>* %a2_
 ; XOP-FALLBACK-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; XOP-FALLBACK-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; XOP-FALLBACK-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; XOP-FALLBACK-NEXT:    vpsrlq $1, %xmm1, %xmm7
+; XOP-FALLBACK-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOP-FALLBACK-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; XOP-FALLBACK-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm1, %xmm6
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; XOP-FALLBACK-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; XOP-FALLBACK-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm5, %xmm5
-; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; XOP-FALLBACK-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; XOP-FALLBACK-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; XOP-FALLBACK-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOP-FALLBACK-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOP-FALLBACK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; XOP-FALLBACK-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOP-FALLBACK-NEXT:    retq
 ;
@@ -1484,7 +1484,7 @@ define <4 x i64> @vec256_i64_signed_mem_mem(<4 x i64>* %a1_addr, <4 x i64>* %a2_
 ; XOPAVX1-NEXT:    vmovdqa 16(%rsi), %xmm2
 ; XOPAVX1-NEXT:    vmovdqa (%rdi), %xmm0
 ; XOPAVX1-NEXT:    vmovdqa 16(%rdi), %xmm3
-; XOPAVX1-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm4
+; XOPAVX1-NEXT:    vpcomgtq %xmm2, %xmm3, %xmm9
 ; XOPAVX1-NEXT:    vpcomgtq %xmm1, %xmm0, %xmm5
 ; XOPAVX1-NEXT:    vpcomltq %xmm2, %xmm3, %xmm6
 ; XOPAVX1-NEXT:    vpcomltq %xmm1, %xmm0, %xmm7
@@ -1492,31 +1492,31 @@ define <4 x i64> @vec256_i64_signed_mem_mem(<4 x i64>* %a1_addr, <4 x i64>* %a2_
 ; XOPAVX1-NEXT:    vblendvpd %xmm5, %xmm0, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vpsubq %xmm7, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vblendvpd %xmm6, %xmm3, %xmm2, %xmm6
-; XOPAVX1-NEXT:    vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vblendvpd %xmm9, %xmm3, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpsrlq $1, %xmm2, %xmm6
+; XOPAVX1-NEXT:    vpsrlq $1, %xmm1, %xmm7
+; XOPAVX1-NEXT:    vpsrlq $33, %xmm1, %xmm1
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm8 = [1,1]
 ; XOPAVX1-NEXT:    vpor %xmm5, %xmm8, %xmm5
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
-; XOPAVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm1, %xmm6
-; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm6
-; XOPAVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
-; XOPAVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
 ; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm1
-; XOPAVX1-NEXT:    vpor %xmm4, %xmm8, %xmm4
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm4, %xmm5
-; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm2, %xmm5
-; XOPAVX1-NEXT:    vpsrlq $32, %xmm2, %xmm7
-; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm7
-; XOPAVX1-NEXT:    vpaddq %xmm7, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
-; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm5, %xmm3
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm4
+; XOPAVX1-NEXT:    vpmuludq %xmm4, %xmm7, %xmm4
+; XOPAVX1-NEXT:    vpaddq %xmm1, %xmm4, %xmm1
+; XOPAVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm4
+; XOPAVX1-NEXT:    vpsrlq $33, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpor %xmm8, %xmm9, %xmm5
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpsrlq $32, %xmm5, %xmm7
+; XOPAVX1-NEXT:    vpmuludq %xmm7, %xmm6, %xmm7
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm7, %xmm2
+; XOPAVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; XOPAVX1-NEXT:    vpmuludq %xmm5, %xmm6, %xmm5
 ; XOPAVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm6, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
 ; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
 ; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; XOPAVX1-NEXT:    retq
 ;
@@ -1531,16 +1531,16 @@ define <4 x i64> @vec256_i64_signed_mem_mem(<4 x i64>* %a1_addr, <4 x i64>* %a2_
 ; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512F-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlq $32, %ymm1, %ymm2
-; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
-; AVX512F-NEXT:    vpmuludq %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512F-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512F-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: vec256_i64_signed_mem_mem:
@@ -1554,16 +1554,16 @@ define <4 x i64> @vec256_i64_signed_mem_mem(<4 x i64>* %a1_addr, <4 x i64>* %a2_
 ; AVX512VL-NEXT:    vpminsq %ymm1, %ymm0, %ymm2
 ; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm1
 ; AVX512VL-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlq $32, %ymm3, %ymm2
-; AVX512VL-NEXT:    vpmuludq %ymm2, %ymm1, %ymm2
-; AVX512VL-NEXT:    vpsrlq $32, %ymm1, %ymm4
-; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
-; AVX512VL-NEXT:    vpaddq %ymm4, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512VL-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512VL-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512VL-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512VL-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512VL-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-FALLBACK-LABEL: vec256_i64_signed_mem_mem:
@@ -1577,16 +1577,16 @@ define <4 x i64> @vec256_i64_signed_mem_mem(<4 x i64>* %a1_addr, <4 x i64>* %a2_
 ; AVX512BW-FALLBACK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; AVX512BW-FALLBACK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; AVX512BW-FALLBACK-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm1, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
-; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm1, %ymm4
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm2, %ymm4, %ymm2
-; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm2, %ymm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $1, %ymm1, %ymm2
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $33, %ymm1, %ymm1
 ; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpsrlq $32, %ymm3, %ymm4
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm1, %ymm4, %ymm1
+; AVX512BW-FALLBACK-NEXT:    vpsllq $32, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT:    vpmuludq %ymm3, %ymm2, %ymm2
 ; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; AVX512BW-FALLBACK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
 ; AVX512BW-FALLBACK-NEXT:    retq
   %a1 = load <4 x i64>, <4 x i64>* %a1_addr
   %a2 = load <4 x i64>, <4 x i64>* %a2_addr

diff  --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index c3743ca82a11..3d6ef8a3bedc 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -146,16 +146,16 @@ define <8 x i64> @vec512_i64_signed_reg_reg(<8 x i64> %a1, <8 x i64> %a2) nounwi
 ; ALL-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; ALL-NEXT:    vpsubq %zmm2, %zmm1, %zmm1
-; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm1
-; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm2
-; ALL-NEXT:    vpmuludq %zmm2, %zmm1, %zmm2
-; ALL-NEXT:    vpsrlq $32, %zmm1, %zmm4
-; ALL-NEXT:    vpmuludq %zmm3, %zmm4, %zmm4
-; ALL-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
-; ALL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm2
+; ALL-NEXT:    vpsrlq $33, %zmm1, %zmm1
 ; ALL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
-; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm4
+; ALL-NEXT:    vpmuludq %zmm4, %zmm2, %zmm4
+; ALL-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
+; ALL-NEXT:    vpsllq $32, %zmm1, %zmm1
+; ALL-NEXT:    vpmuludq %zmm3, %zmm2, %zmm2
 ; ALL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed
   %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
@@ -178,16 +178,16 @@ define <8 x i64> @vec512_i64_unsigned_reg_reg(<8 x i64> %a1, <8 x i64> %a2) noun
 ; ALL-NEXT:    vpminuq %zmm1, %zmm0, %zmm2
 ; ALL-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm1
 ; ALL-NEXT:    vpsubq %zmm2, %zmm1, %zmm1
-; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm1
-; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm2
-; ALL-NEXT:    vpmuludq %zmm2, %zmm1, %zmm2
-; ALL-NEXT:    vpsrlq $32, %zmm1, %zmm4
-; ALL-NEXT:    vpmuludq %zmm3, %zmm4, %zmm4
-; ALL-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
-; ALL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm2
+; ALL-NEXT:    vpsrlq $33, %zmm1, %zmm1
 ; ALL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
-; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm4
+; ALL-NEXT:    vpmuludq %zmm4, %zmm2, %zmm4
+; ALL-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
+; ALL-NEXT:    vpsllq $32, %zmm1, %zmm1
+; ALL-NEXT:    vpmuludq %zmm3, %zmm2, %zmm2
 ; ALL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %t3 = icmp ugt <8 x i64> %a1, %a2
   %t4 = select <8 x i1> %t3, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
@@ -213,16 +213,16 @@ define <8 x i64> @vec512_i64_signed_mem_reg(<8 x i64>* %a1_addr, <8 x i64> %a2)
 ; ALL-NEXT:    vpminsq %zmm0, %zmm1, %zmm2
 ; ALL-NEXT:    vpmaxsq %zmm0, %zmm1, %zmm0
 ; ALL-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
-; ALL-NEXT:    vpsrlq $1, %zmm0, %zmm0
-; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm2
-; ALL-NEXT:    vpmuludq %zmm2, %zmm0, %zmm2
-; ALL-NEXT:    vpsrlq $32, %zmm0, %zmm4
-; ALL-NEXT:    vpmuludq %zmm3, %zmm4, %zmm4
-; ALL-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
-; ALL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; ALL-NEXT:    vpsrlq $1, %zmm0, %zmm2
+; ALL-NEXT:    vpsrlq $33, %zmm0, %zmm0
 ; ALL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm0
-; ALL-NEXT:    vpaddq %zmm1, %zmm2, %zmm1
+; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm4
+; ALL-NEXT:    vpmuludq %zmm4, %zmm2, %zmm4
+; ALL-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
+; ALL-NEXT:    vpsllq $32, %zmm0, %zmm0
+; ALL-NEXT:    vpmuludq %zmm3, %zmm2, %zmm2
 ; ALL-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %a1 = load <8 x i64>, <8 x i64>* %a1_addr
   %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed
@@ -247,16 +247,16 @@ define <8 x i64> @vec512_i64_signed_reg_mem(<8 x i64> %a1, <8 x i64>* %a2_addr)
 ; ALL-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; ALL-NEXT:    vpsubq %zmm2, %zmm1, %zmm1
-; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm1
-; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm2
-; ALL-NEXT:    vpmuludq %zmm2, %zmm1, %zmm2
-; ALL-NEXT:    vpsrlq $32, %zmm1, %zmm4
-; ALL-NEXT:    vpmuludq %zmm3, %zmm4, %zmm4
-; ALL-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
-; ALL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm2
+; ALL-NEXT:    vpsrlq $33, %zmm1, %zmm1
 ; ALL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
-; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm4
+; ALL-NEXT:    vpmuludq %zmm4, %zmm2, %zmm4
+; ALL-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
+; ALL-NEXT:    vpsllq $32, %zmm1, %zmm1
+; ALL-NEXT:    vpmuludq %zmm3, %zmm2, %zmm2
 ; ALL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %a2 = load <8 x i64>, <8 x i64>* %a2_addr
   %t3 = icmp sgt <8 x i64> %a1, %a2 ; signed
@@ -282,16 +282,16 @@ define <8 x i64> @vec512_i64_signed_mem_mem(<8 x i64>* %a1_addr, <8 x i64>* %a2_
 ; ALL-NEXT:    vpminsq %zmm1, %zmm0, %zmm2
 ; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm1
 ; ALL-NEXT:    vpsubq %zmm2, %zmm1, %zmm1
-; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm1
-; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm2
-; ALL-NEXT:    vpmuludq %zmm2, %zmm1, %zmm2
-; ALL-NEXT:    vpsrlq $32, %zmm1, %zmm4
-; ALL-NEXT:    vpmuludq %zmm3, %zmm4, %zmm4
-; ALL-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
-; ALL-NEXT:    vpsllq $32, %zmm2, %zmm2
+; ALL-NEXT:    vpsrlq $1, %zmm1, %zmm2
+; ALL-NEXT:    vpsrlq $33, %zmm1, %zmm1
 ; ALL-NEXT:    vpmuludq %zmm3, %zmm1, %zmm1
-; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; ALL-NEXT:    vpsrlq $32, %zmm3, %zmm4
+; ALL-NEXT:    vpmuludq %zmm4, %zmm2, %zmm4
+; ALL-NEXT:    vpaddq %zmm1, %zmm4, %zmm1
+; ALL-NEXT:    vpsllq $32, %zmm1, %zmm1
+; ALL-NEXT:    vpmuludq %zmm3, %zmm2, %zmm2
 ; ALL-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; ALL-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; ALL-NEXT:    retq
   %a1 = load <8 x i64>, <8 x i64>* %a1_addr
   %a2 = load <8 x i64>, <8 x i64>* %a2_addr
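
For readers skimming the regenerated checks above: the new vpsrlq $33 instructions come from folding the by-1 halving shift into the by-32 high-half shift of the 64-bit multiply expansion, i.e. the logical-shift form of (shift (shift X, C2), C1) -> (shift X, (C1 + C2)). A minimal sketch of that identity as a reduced IR test follows (hypothetical, not part of this commit; plain IR like this may already be folded by the generic DAG combiner, whereas the change above applies to the X86ISD::VSRLI/VSRAI nodes created during lowering):

define <2 x i64> @double_lshr(<2 x i64> %x) {
  ; 1 + 32 = 33 is still below the 64-bit element width, so the two
  ; logical right shifts can be merged into a single shift by 33; if
  ; the sum reached the element width the result would fold to zero.
  %half = lshr <2 x i64> %x, <i64 1, i64 1>
  %hi   = lshr <2 x i64> %half, <i64 32, i64 32>
  ret <2 x i64> %hi
}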

