[llvm] f8d9097 - [DAGCombiner] `combineShuffleOfSplatVal()`: try to canonicalize to a splat shuffle

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Sun Jan 15 10:12:27 PST 2023


Author: Roman Lebedev
Date: 2023-01-15T21:11:33+03:00
New Revision: f8d9097168b73e3befd94ac4e16cbc5f944afcec

URL: https://github.com/llvm/llvm-project/commit/f8d9097168b73e3befd94ac4e16cbc5f944afcec
DIFF: https://github.com/llvm/llvm-project/commit/f8d9097168b73e3befd94ac4e16cbc5f944afcec.diff

LOG: [DAGCombiner] `combineShuffleOfSplatVal()`: try to canonicalize to a splat shuffle

As noted in https://reviews.llvm.org/D141778#inline-1369900,
we fail to produce splat shuffles from certain sequences
of shuffles, that may have non-shuffles in the middle of seq.

There is a big pitfail to avoid here: just because `isSplatValue()`
says that all demanded elements are splat, we can't pick any random
one of them, because some of them could be undef! We must ignore those!

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll
    llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
    llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
    llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
    llvm/test/CodeGen/X86/vec_smulo.ll
    llvm/test/CodeGen/X86/vec_umulo.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0e9e423767d4a..af27187bb97c8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23340,9 +23340,50 @@ static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
 // the masks of the shuffles.
 static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,
                                         SelectionDAG &DAG) {
+  EVT VT = Shuf->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+
   if (!Shuf->getOperand(1).isUndef())
     return SDValue();
 
+  // See if this unary non-splat shuffle actually *is* a splat shuffle,
+  // in disguise, with all demanded elements being identical.
+  // FIXME: this can be done per-operand.
+  if (!Shuf->isSplat()) {
+    APInt DemandedElts(NumElts, 0);
+    for (int Idx : Shuf->getMask()) {
+      if (Idx < 0)
+        continue; // Ignore sentinel indices.
+      assert((unsigned)Idx < NumElts && "Out-of-bounds shuffle indice?");
+      DemandedElts.setBit(Idx);
+    }
+    assert(DemandedElts.countPopulation() > 1 && "Is a splat shuffle already?");
+    APInt UndefElts;
+    if (DAG.isSplatValue(Shuf->getOperand(0), DemandedElts, UndefElts)) {
+      // Even if all demanded elements are splat, some of them could be undef.
+      // Which lowest demanded element is *not* known-undef?
+      unsigned MinNonUndefIdx = ~0U;
+      for (int Idx : Shuf->getMask()) {
+        if (Idx < 0 || UndefElts[Idx])
+          continue; // Ignore sentinel indices, and undef elements.
+        MinNonUndefIdx = std::min<unsigned>(Idx, MinNonUndefIdx);
+      }
+      assert(MinNonUndefIdx < NumElts && "Undef shuffle?");
+      SmallVector<int, 8> SplatMask(Shuf->getMask().begin(),
+                                    Shuf->getMask().end());
+      for (int &Idx : SplatMask) {
+        if (Idx < 0)
+          continue; // Passthrough sentinel indices.
+        // Otherwise, just pick the lowest demanded non-undef element.
+        // Or sentinel undef, if we know we'd pick a known-undef element.
+        Idx = UndefElts[Idx] ? -1 : MinNonUndefIdx;
+      }
+      assert(SplatMask != Shuf->getMask() && "Expected mask to change!");
+      return DAG.getVectorShuffle(VT, SDLoc(Shuf), Shuf->getOperand(0),
+                                  Shuf->getOperand(1), SplatMask);
+    }
+  }
+
   // If the inner operand is a known splat with no undefs, just return that directly.
   // TODO: Create DemandedElts mask from Shuf's mask.
   // TODO: Allow undef elements and merge with the shuffle code below.

diff  --git a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll
index a22eb055630e1..24d3f8273d79c 100644
--- a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll
+++ b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll
@@ -573,14 +573,12 @@ define void @test_v2i64_none() {
 ; CHECK-AIX-32-P8:       # %bb.0: # %entry
 ; CHECK-AIX-32-P8-NEXT:    lfiwzx f0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    xxspltw vs0, vs0, 1
-; CHECK-AIX-32-P8-NEXT:    xxmrghw vs0, vs0, vs0
 ; CHECK-AIX-32-P8-NEXT:    stxvw4x vs0, 0, r3
 ; CHECK-AIX-32-P8-NEXT:    blr
 ;
 ; CHECK-AIX-32-P9-LABEL: test_v2i64_none:
 ; CHECK-AIX-32-P9:       # %bb.0: # %entry
 ; CHECK-AIX-32-P9-NEXT:    lxvwsx vs0, 0, r3
-; CHECK-AIX-32-P9-NEXT:    xxmrghw vs0, vs0, vs0
 ; CHECK-AIX-32-P9-NEXT:    stxv vs0, 0(r3)
 ; CHECK-AIX-32-P9-NEXT:    blr
 entry:

diff  --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index aeb39a1e242d9..59c64a3de71d0 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -621,14 +621,15 @@ define <4 x i32> @test_srem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm4, %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,3,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm4, %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -1097,7 +1098,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_srem_even_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm2
-; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [3067833783,3067833783,1,3067833783]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <3067833783,u,1,u>
 ; CHECK-SSE2-NEXT:    pmuludq %xmm0, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -1109,14 +1110,15 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-NEXT:    pmuludq %xmm1, %xmm3
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,3,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm5, %xmm1
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
 ; CHECK-SSE2-NEXT:    por %xmm4, %xmm3
-; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; CHECK-SSE2-NEXT:    pxor %xmm5, %xmm3
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
 ; CHECK-SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
 ; CHECK-SSE2-NEXT:    pxor %xmm3, %xmm1

diff  --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index fd496780f535d..2092de25d5d0e 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -137,14 +137,12 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
 ; SSE2-NEXT:    movd %edx, %xmm0
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE2-NEXT:    psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = <683,1463,819,u>
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = <683,u,819,u>
 ; SSE2-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT:    movl $1463, %eax # imm = 0x5B7
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3

diff  --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
index 85e10f209f2b3..7846f0e8f6e03 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll
@@ -159,18 +159,19 @@ define <4 x i32> @test_urem_even_allones_eq(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_allones_eq:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -236,18 +237,19 @@ define <4 x i32> @test_urem_even_allones_ne(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_allones_ne:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    psrld $31, %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -534,18 +536,19 @@ define <4 x i32> @test_urem_even_poweroftwo(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_poweroftwo:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq
@@ -965,18 +968,19 @@ define <4 x i32> @test_urem_even_INT_MIN(<4 x i32> %X) nounwind {
 ; CHECK-SSE2-LABEL: test_urem_even_INT_MIN:
 ; CHECK-SSE2:       # %bb.0:
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-SSE2-NEXT:    pmuludq %xmm2, %xmm1
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
-; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-SSE2-NEXT:    por %xmm2, %xmm0
-; CHECK-SSE2-NEXT:    pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE2-NEXT:    por %xmm4, %xmm0
+; CHECK-SSE2-NEXT:    pxor %xmm2, %xmm0
 ; CHECK-SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-SSE2-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index fef44e062fd3b..094dd4b9a1aca 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -423,23 +423,23 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movq %rdi, %rax
 ; SSE2-NEXT:    movd %r8d, %xmm0
-; SSE2-NEXT:    movd %ecx, %xmm1
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movd %edx, %xmm0
-; SSE2-NEXT:    movd %esi, %xmm2
+; SSE2-NEXT:    movd %ecx, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd %edx, %xmm0
+; SSE2-NEXT:    movd %esi, %xmm1
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0]
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
 ; SSE2-NEXT:    movd %r9d, %xmm0
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    pmuludq %xmm6, %xmm0
@@ -448,70 +448,72 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-NEXT:    pxor %xmm7, %xmm7
 ; SSE2-NEXT:    pxor %xmm8, %xmm8
 ; SSE2-NEXT:    pcmpgtd %xmm5, %xmm8
-; SSE2-NEXT:    pand %xmm2, %xmm8
+; SSE2-NEXT:    pand %xmm1, %xmm8
 ; SSE2-NEXT:    pxor %xmm9, %xmm9
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm9
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm9
 ; SSE2-NEXT:    pand %xmm5, %xmm9
 ; SSE2-NEXT:    paddd %xmm8, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm5, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm2[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm5, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[1,3,2,3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    pmuludq %xmm8, %xmm5
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSE2-NEXT:    psubd %xmm9, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT:    movdqa %xmm2, (%rcx)
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSE2-NEXT:    movdqa %xmm1, (%rcx)
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm10, %xmm1
 ; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSE2-NEXT:    pxor %xmm5, %xmm2
+; SSE2-NEXT:    pxor %xmm5, %xmm1
 ; SSE2-NEXT:    pxor %xmm8, %xmm8
 ; SSE2-NEXT:    pcmpgtd %xmm6, %xmm8
 ; SSE2-NEXT:    pand %xmm3, %xmm8
 ; SSE2-NEXT:    pcmpgtd %xmm3, %xmm7
 ; SSE2-NEXT:    pand %xmm6, %xmm7
 ; SSE2-NEXT:    paddd %xmm8, %xmm7
-; SSE2-NEXT:    pmuludq %xmm4, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    pmuludq %xmm3, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; SSE2-NEXT:    psubd %xmm7, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE2-NEXT:    movq %xmm0, 16(%rcx)
 ; SSE2-NEXT:    psrad $31, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSE2-NEXT:    pxor %xmm5, %xmm0
 ; SSE2-NEXT:    movq %xmm0, 16(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, (%rdi)
+; SSE2-NEXT:    movdqa %xmm1, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: smulo_v6i32:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movq %rdi, %rax
 ; SSSE3-NEXT:    movd %r8d, %xmm0
-; SSSE3-NEXT:    movd %ecx, %xmm1
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT:    movd %edx, %xmm0
-; SSSE3-NEXT:    movd %esi, %xmm2
+; SSSE3-NEXT:    movd %ecx, %xmm2
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd %edx, %xmm0
+; SSSE3-NEXT:    movd %esi, %xmm1
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm1[0]
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
 ; SSSE3-NEXT:    movd %r9d, %xmm0
-; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm3
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
 ; SSSE3-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    pmuludq %xmm6, %xmm0
@@ -520,47 +522,49 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSSE3-NEXT:    pxor %xmm7, %xmm7
 ; SSSE3-NEXT:    pxor %xmm8, %xmm8
 ; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm8
-; SSSE3-NEXT:    pand %xmm2, %xmm8
+; SSSE3-NEXT:    pand %xmm1, %xmm8
 ; SSSE3-NEXT:    pxor %xmm9, %xmm9
-; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm9
+; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm9
 ; SSSE3-NEXT:    pand %xmm5, %xmm9
 ; SSSE3-NEXT:    paddd %xmm8, %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
-; SSSE3-NEXT:    pmuludq %xmm5, %xmm2
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm2[1,3,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
+; SSSE3-NEXT:    pmuludq %xmm5, %xmm1
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[1,3,2,3]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
 ; SSSE3-NEXT:    pmuludq %xmm8, %xmm5
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm5[1,3,2,3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
 ; SSSE3-NEXT:    psubd %xmm9, %xmm10
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSSE3-NEXT:    movdqa %xmm2, (%rcx)
-; SSSE3-NEXT:    psrad $31, %xmm2
-; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm2
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; SSSE3-NEXT:    movdqa %xmm1, (%rcx)
+; SSSE3-NEXT:    psrad $31, %xmm1
+; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm1
 ; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
-; SSSE3-NEXT:    pxor %xmm5, %xmm2
+; SSSE3-NEXT:    pxor %xmm5, %xmm1
 ; SSSE3-NEXT:    pxor %xmm8, %xmm8
 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm8
 ; SSSE3-NEXT:    pand %xmm3, %xmm8
 ; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm7
 ; SSSE3-NEXT:    pand %xmm6, %xmm7
 ; SSSE3-NEXT:    paddd %xmm8, %xmm7
-; SSSE3-NEXT:    pmuludq %xmm4, %xmm1
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSSE3-NEXT:    pmuludq %xmm3, %xmm2
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
 ; SSSE3-NEXT:    psubd %xmm7, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSSE3-NEXT:    movq %xmm0, 16(%rcx)
 ; SSSE3-NEXT:    psrad $31, %xmm0
 ; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm0
 ; SSSE3-NEXT:    pxor %xmm5, %xmm0
 ; SSSE3-NEXT:    movq %xmm0, 16(%rdi)
-; SSSE3-NEXT:    movdqa %xmm2, (%rdi)
+; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: smulo_v6i32:
@@ -569,43 +573,47 @@ define <6 x i32> @smulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSE41-NEXT:    movd %esi, %xmm2
 ; SSE41-NEXT:    pinsrd $1, %edx, %xmm2
 ; SSE41-NEXT:    pinsrd $2, %ecx, %xmm2
-; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pmuldq %xmm2, %xmm1
+; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    movdqa %xmm0, %xmm1
+; SSE41-NEXT:    pmuldq %xmm2, %xmm0
 ; SSE41-NEXT:    pinsrd $3, %r8d, %xmm2
+; SSE41-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
 ; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; SSE41-NEXT:    movd %r9d, %xmm4
 ; SSE41-NEXT:    movdqa %xmm4, %xmm5
 ; SSE41-NEXT:    pmuldq %xmm3, %xmm4
-; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
-; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm5
+; SSE41-NEXT:    pinsrd $1, %ecx, %xmm3
+; SSE41-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; SSE41-NEXT:    pinsrd $1, %edx, %xmm5
 ; SSE41-NEXT:    pmulld %xmm3, %xmm5
-; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm0
-; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm1
+; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; SSE41-NEXT:    movd %ecx, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
+; SSE41-NEXT:    movd %edx, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,0,0,0]
 ; SSE41-NEXT:    pmuldq %xmm3, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
-; SSE41-NEXT:    pmulld %xmm2, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, (%rcx)
-; SSE41-NEXT:    psrad $31, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm0
-; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT:    pxor %xmm1, %xmm0
-; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE41-NEXT:    pmuldq %xmm2, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; SSE41-NEXT:    movq %xmm5, 16(%rcx)
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
+; SSE41-NEXT:    movq %xmm5, 16(%rsi)
 ; SSE41-NEXT:    psrad $31, %xmm5
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm5
-; SSE41-NEXT:    pxor %xmm1, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
+; SSE41-NEXT:    pxor %xmm3, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pmuldq %xmm4, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
+; SSE41-NEXT:    pmulld %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, (%rsi)
+; SSE41-NEXT:    psrad $31, %xmm1
+; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE41-NEXT:    pxor %xmm3, %xmm1
 ; SSE41-NEXT:    movq %xmm5, 16(%rdi)
-; SSE41-NEXT:    movdqa %xmm0, (%rdi)
+; SSE41-NEXT:    movdqa %xmm1, (%rdi)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: smulo_v6i32:

diff  --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index f7d4deeb85411..e556e7a7d6e7b 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -394,8 +394,8 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = mem[0,0,0,0]
 ; SSE2-NEXT:    pmuludq %xmm2, %xmm6
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
@@ -444,8 +444,8 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = mem[0,0,0,0]
+; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = mem[0,0,0,0]
 ; SSSE3-NEXT:    pmuludq %xmm2, %xmm6
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
@@ -462,45 +462,49 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
 ; SSE41-LABEL: umulo_v6i32:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movq %rdi, %rax
+; SSE41-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE41-NEXT:    movd %r9d, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
+; SSE41-NEXT:    pmuludq %xmm2, %xmm1
+; SSE41-NEXT:    pinsrd $1, %edi, %xmm2
+; SSE41-NEXT:    movl {{[0-9]+}}(%rsp), %r9d
+; SSE41-NEXT:    pinsrd $1, %r9d, %xmm0
+; SSE41-NEXT:    pmulld %xmm2, %xmm0
 ; SSE41-NEXT:    movd %esi, %xmm2
 ; SSE41-NEXT:    pinsrd $1, %edx, %xmm2
 ; SSE41-NEXT:    pinsrd $2, %ecx, %xmm2
-; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pmuludq %xmm2, %xmm1
-; SSE41-NEXT:    pinsrd $3, %r8d, %xmm2
 ; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; SSE41-NEXT:    movd %r9d, %xmm4
-; SSE41-NEXT:    movdqa %xmm4, %xmm5
-; SSE41-NEXT:    pmuludq %xmm3, %xmm4
 ; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
-; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm5
-; SSE41-NEXT:    pmulld %xmm3, %xmm5
-; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm3
+; SSE41-NEXT:    movdqa %xmm3, %xmm4
+; SSE41-NEXT:    pmuludq %xmm2, %xmm3
+; SSE41-NEXT:    pinsrd $3, %r8d, %xmm2
+; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm4
 ; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE41-NEXT:    pmuludq %xmm3, %xmm6
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSE41-NEXT:    pmuludq %xmm5, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5],xmm6[6,7]
+; SSE41-NEXT:    pxor %xmm5, %xmm5
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm3
 ; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
-; SSE41-NEXT:    pxor %xmm6, %xmm1
-; SSE41-NEXT:    movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; SSE41-NEXT:    movd {{.*#+}} xmm8 = mem[0],zero,zero,zero
+; SSE41-NEXT:    pxor %xmm6, %xmm3
+; SSE41-NEXT:    movd %edi, %xmm7
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0]
+; SSE41-NEXT:    movd %r9d, %xmm8
+; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0]
 ; SSE41-NEXT:    pmuludq %xmm7, %xmm8
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3],xmm4[4,5],xmm8[6,7]
-; SSE41-NEXT:    pcmpeqd %xmm3, %xmm4
-; SSE41-NEXT:    pxor %xmm6, %xmm4
-; SSE41-NEXT:    pmulld %xmm2, %xmm0
-; SSE41-NEXT:    movq %xmm5, 16(%rcx)
-; SSE41-NEXT:    movdqa %xmm0, (%rcx)
-; SSE41-NEXT:    movq %xmm4, 16(%rdi)
-; SSE41-NEXT:    movdqa %xmm1, (%rdi)
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7]
+; SSE41-NEXT:    pcmpeqd %xmm5, %xmm1
+; SSE41-NEXT:    pxor %xmm6, %xmm1
+; SSE41-NEXT:    pmulld %xmm2, %xmm4
+; SSE41-NEXT:    movq %xmm0, 16(%rcx)
+; SSE41-NEXT:    movdqa %xmm4, (%rcx)
+; SSE41-NEXT:    movq %xmm1, 16(%rax)
+; SSE41-NEXT:    movdqa %xmm3, (%rax)
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: umulo_v6i32:


        


More information about the llvm-commits mailing list