[llvm] r369459 - [DAGCombiner][X86] Teach visitCONCAT_VECTORS to combine (concat_vectors (concat_vectors X, Y), undef) -> (concat_vectors X, Y, undef, undef)

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 20 15:12:50 PDT 2019


Author: ctopper
Date: Tue Aug 20 15:12:50 2019
New Revision: 369459

URL: http://llvm.org/viewvc/llvm-project?rev=369459&view=rev
Log:
[DAGCombiner][X86] Teach visitCONCAT_VECTORS to combine (concat_vectors (concat_vectors X, Y), undef) -> (concat_vectors X, Y, undef, undef)

I also had to add a new combine to X86's combineExtractSubvector to prevent a regression.

This helps our vXi1 code see the full concat operation, allowing it to optimize an undef to zero when there is already a zero in the concat. That let us use a movzx instead of an AND in some of the tests. In those tests, one concat comes from SelectionDAGBuilder and the second comes from type legalization of v4i1->i4 bitcasts, which uses an additional concat. These changes weren't my original motivation, though.
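
As a sketch of the transform (the v2i1/v4i1 types here are my own illustration, not lifted from a specific test):

    t1: v4i1 = concat_vectors X:v2i1, zero:v2i1
    t2: v8i1 = concat_vectors t1, undef:v4i1
  -->
    t2: v8i1 = concat_vectors X:v2i1, zero:v2i1, undef:v2i1, undef:v2i1

With the flattened form, the vXi1 lowering can materialize the trailing undef elements as zeros because a zero operand is already visible in the same concat.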

I'm looking at making X86ISelLowering's narrowShuffle emit a concat_vectors instead of an insert_subvector, since concat_vectors is more canonical during early DAG combine. This patch helps prevent a regression from my experiments with that.

Differential Revision: https://reviews.llvm.org/D66456

Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
    llvm/trunk/test/CodeGen/X86/oddshuffles.ll
    llvm/trunk/test/CodeGen/X86/vec_saddo.ll
    llvm/trunk/test/CodeGen/X86/vec_smulo.ll
    llvm/trunk/test/CodeGen/X86/vec_ssubo.ll
    llvm/trunk/test/CodeGen/X86/vec_uaddo.ll
    llvm/trunk/test/CodeGen/X86/vec_umulo.ll
    llvm/trunk/test/CodeGen/X86/vec_usubo.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=369459&r1=369458&r2=369459&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Tue Aug 20 15:12:50 2019
@@ -17687,6 +17687,15 @@ SDValue DAGCombiner::visitCONCAT_VECTORS
     SDValue In = N->getOperand(0);
     assert(In.getValueType().isVector() && "Must concat vectors");
 
+    // If the input is a concat_vectors, just make a larger concat by padding
+    // with smaller undefs.
+    if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) {
+      unsigned NumOps = N->getNumOperands() * In.getNumOperands();
+      SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end());
+      Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType()));
+      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
+    }
+
     SDValue Scalar = peekThroughOneUseBitcasts(In);
 
     // concat_vectors(scalar_to_vector(scalar), undef) ->

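To trace the new DAGCombiner code on the title transform (a worked example under the same hypothetical types as above): N = concat_vectors(In, undef) has 2 operands and In = concat_vectors(X, Y) has 2, so NumOps = 2 * 2 = 4; Ops starts as [X, Y] copied from In's operands and is resized to [X, Y, undef:v2i1, undef:v2i1], which becomes the operand list of the rebuilt wider concat_vectors. The hasOneUse() check keeps the inner concat from being duplicated when it also feeds other nodes.
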
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=369459&r1=369458&r2=369459&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Aug 20 15:12:50 2019
@@ -44504,6 +44504,20 @@ static SDValue combineExtractSubvector(S
     }
   }
 
+  // If we are extracting from an insert into a zero vector, replace with a
+  // smaller insert into zero, provided the extracted type is at least as
+  // large as the inserted subvector. Don't do this for i1 vectors.
+  if (VT.getVectorElementType() != MVT::i1 &&
+      InVec.getOpcode() == ISD::INSERT_SUBVECTOR && IdxVal == 0 &&
+      InVec.hasOneUse() && isNullConstant(InVec.getOperand(2)) &&
+      ISD::isBuildVectorAllZeros(InVec.getOperand(0).getNode()) &&
+      InVec.getOperand(1).getValueSizeInBits() <= VT.getSizeInBits()) {
+    SDLoc DL(N);
+    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+                       getZeroVector(VT, Subtarget, DAG, DL),
+                       InVec.getOperand(1), InVec.getOperand(2));
+  }
+
   // If we're extracting from a broadcast then we're better off just
   // broadcasting to the smaller type directly, assuming this is the only use.
   // As its a broadcast we don't care about the extraction index.

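The effect of the new combineExtractSubvector pattern, sketched with hypothetical types chosen for illustration:

    t0: v16i32 = insert_subvector zero:v16i32, X:v4i32, 0
    t1: v8i32  = extract_subvector t0, 0
  -->
    t1: v8i32 = insert_subvector zero:v8i32, X:v4i32, 0

This fires only when the extract reads from index 0, the insert was at index 0 of an all-zeros vector, and the inserted v4i32 payload fits inside the extracted v8i32, so every bit the extract observes (payload or zero) is preserved.
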
Modified: llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll?rev=369459&r1=369458&r2=369459&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll Tue Aug 20 15:12:50 2019
@@ -2684,7 +2684,6 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2711,7 +2710,6 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2741,7 +2739,6 @@ define zeroext i4 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2773,7 +2770,6 @@ define zeroext i4 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2805,7 +2801,6 @@ define zeroext i4 @test_vpcmpeqq_v2i1_v4
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2836,7 +2831,6 @@ define zeroext i4 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7526,7 +7520,6 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7553,7 +7546,6 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7583,7 +7575,6 @@ define zeroext i4 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7615,7 +7606,6 @@ define zeroext i4 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7647,7 +7637,6 @@ define zeroext i4 @test_vpcmpsgtq_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7678,7 +7667,6 @@ define zeroext i4 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12428,7 +12416,6 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12455,7 +12442,6 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12485,7 +12471,6 @@ define zeroext i4 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12517,7 +12502,6 @@ define zeroext i4 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12549,7 +12533,6 @@ define zeroext i4 @test_vpcmpsgeq_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12580,7 +12563,6 @@ define zeroext i4 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17350,7 +17332,6 @@ define zeroext i4 @test_vpcmpultq_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17377,7 +17358,6 @@ define zeroext i4 @test_vpcmpultq_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17407,7 +17387,6 @@ define zeroext i4 @test_masked_vpcmpultq
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17439,7 +17418,6 @@ define zeroext i4 @test_masked_vpcmpultq
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17471,7 +17449,6 @@ define zeroext i4 @test_vpcmpultq_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17502,7 +17479,6 @@ define zeroext i4 @test_masked_vpcmpultq
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21219,7 +21195,6 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21246,7 +21221,6 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21274,7 +21248,6 @@ define zeroext i4 @test_vcmpoeqpd_v2i1_v
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21305,7 +21278,6 @@ define zeroext i4 @test_masked_vcmpoeqpd
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21336,7 +21308,6 @@ define zeroext i4 @test_masked_vcmpoeqpd
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21368,7 +21339,6 @@ define zeroext i4 @test_masked_vcmpoeqpd
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    andl $3, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:

Modified: llvm/trunk/test/CodeGen/X86/oddshuffles.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/oddshuffles.ll?rev=369459&r1=369458&r2=369459&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/oddshuffles.ll (original)
+++ llvm/trunk/test/CodeGen/X86/oddshuffles.ll Tue Aug 20 15:12:50 2019
@@ -1513,34 +1513,34 @@ define void @interleave_24i32_in(<24 x i
 ; AVX1-LABEL: interleave_24i32_in:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vmovupd (%rsi), %ymm0
-; AVX1-NEXT:    vmovups 16(%rcx), %xmm1
-; AVX1-NEXT:    vmovups (%rdx), %xmm2
-; AVX1-NEXT:    vmovups 16(%rdx), %xmm3
-; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[3,0],xmm1[3,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[2,1],xmm4[0,2]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[1,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm0[1,1,3,3]
-; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
+; AVX1-NEXT:    vmovups (%rdx), %xmm1
+; AVX1-NEXT:    vmovups 16(%rdx), %xmm2
 ; AVX1-NEXT:    vmovups (%rsi), %xmm3
-; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm2[2,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[1,1],xmm4[0,2]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm3[0,0]
-; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[2,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm1[2,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = mem[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
+; AVX1-NEXT:    vmovups 16(%rcx), %xmm3
+; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm2[3,0],xmm3[3,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm4 = xmm3[2,1],xmm4[0,2]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,0]
+; AVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,2]
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = ymm0[1,1,3,3]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2]
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7]
 ; AVX1-NEXT:    vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
 ; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX1-NEXT:    vmovups %ymm2, (%rdi)
-; AVX1-NEXT:    vmovups %ymm1, 64(%rdi)
+; AVX1-NEXT:    vmovups %ymm2, 64(%rdi)
+; AVX1-NEXT:    vmovups %ymm1, (%rdi)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -1549,17 +1549,17 @@ define void @interleave_24i32_in(<24 x i
 ; AVX2-SLOW-NEXT:    vmovups (%rsi), %ymm0
 ; AVX2-SLOW-NEXT:    vmovups (%rdx), %ymm1
 ; AVX2-SLOW-NEXT:    vmovups (%rcx), %ymm2
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm3 = ymm2[2,1,3,3]
-; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7]
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm3 = mem[1,0,2,2]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[0,3,3,3]
+; AVX2-SLOW-NEXT:    vbroadcastsd (%rcx), %ymm4
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} xmm4 = mem[1,0,2,2]
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
+; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7]
-; AVX2-SLOW-NEXT:    vbroadcastsd (%rcx), %ymm5
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
 ; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
@@ -1567,8 +1567,8 @@ define void @interleave_24i32_in(<24 x i
 ; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
 ; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
 ; AVX2-SLOW-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX2-SLOW-NEXT:    vmovups %ymm4, (%rdi)
-; AVX2-SLOW-NEXT:    vmovups %ymm3, 64(%rdi)
+; AVX2-SLOW-NEXT:    vmovups %ymm4, 64(%rdi)
+; AVX2-SLOW-NEXT:    vmovups %ymm3, (%rdi)
 ; AVX2-SLOW-NEXT:    vzeroupper
 ; AVX2-SLOW-NEXT:    retq
 ;
@@ -1577,27 +1577,27 @@ define void @interleave_24i32_in(<24 x i
 ; AVX2-FAST-NEXT:    vmovups (%rsi), %ymm0
 ; AVX2-FAST-NEXT:    vmovups (%rdx), %ymm1
 ; AVX2-FAST-NEXT:    vmovups (%rcx), %ymm2
-; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm3 = [5,6,5,6,5,6,7,7]
+; AVX2-FAST-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2]
+; AVX2-FAST-NEXT:    # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm3, %ymm3
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[0,3,3,3]
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
+; AVX2-FAST-NEXT:    vbroadcastsd (%rcx), %ymm4
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2]
+; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7]
+; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm4, %ymm4
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3]
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
 ; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
-; AVX2-FAST-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2]
-; AVX2-FAST-NEXT:    # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1]
-; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-NEXT:    vbroadcastsd (%rcx), %ymm1
+; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
+; AVX2-FAST-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7]
 ; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-NEXT:    vmovups %ymm0, (%rdi)
-; AVX2-FAST-NEXT:    vmovups %ymm2, 32(%rdi)
-; AVX2-FAST-NEXT:    vmovups %ymm3, 64(%rdi)
+; AVX2-FAST-NEXT:    vmovups %ymm0, 32(%rdi)
+; AVX2-FAST-NEXT:    vmovups %ymm4, 64(%rdi)
+; AVX2-FAST-NEXT:    vmovups %ymm3, (%rdi)
 ; AVX2-FAST-NEXT:    vzeroupper
 ; AVX2-FAST-NEXT:    retq
 ;
@@ -1605,32 +1605,32 @@ define void @interleave_24i32_in(<24 x i
 ; XOP:       # %bb.0:
 ; XOP-NEXT:    vmovupd (%rsi), %ymm0
 ; XOP-NEXT:    vmovups (%rcx), %ymm1
-; XOP-NEXT:    vmovups 16(%rcx), %xmm2
-; XOP-NEXT:    vmovups (%rdx), %xmm3
-; XOP-NEXT:    vmovups 16(%rdx), %xmm4
-; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0]
-; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2]
-; XOP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0]
-; XOP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2]
-; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; XOP-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
-; XOP-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
-; XOP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
+; XOP-NEXT:    vmovups (%rdx), %xmm2
+; XOP-NEXT:    vmovups 16(%rdx), %xmm3
 ; XOP-NEXT:    vmovups (%rsi), %xmm4
-; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0]
-; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2]
-; XOP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0]
-; XOP-NEXT:    vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1]
-; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0]
+; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2]
+; XOP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0]
+; XOP-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
+; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
 ; XOP-NEXT:    vpermilps {{.*#+}} xmm4 = mem[0,1,0,1]
 ; XOP-NEXT:    vinsertf128 $1, %xmm4, %ymm4, %ymm4
+; XOP-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
+; XOP-NEXT:    vmovups 16(%rcx), %xmm4
+; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0]
+; XOP-NEXT:    vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2]
+; XOP-NEXT:    vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0]
+; XOP-NEXT:    vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2]
+; XOP-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; XOP-NEXT:    vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3]
+; XOP-NEXT:    vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
 ; XOP-NEXT:    vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
 ; XOP-NEXT:    vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5]
 ; XOP-NEXT:    vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7]
 ; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
 ; XOP-NEXT:    vmovups %ymm0, 32(%rdi)
-; XOP-NEXT:    vmovups %ymm3, (%rdi)
-; XOP-NEXT:    vmovups %ymm2, 64(%rdi)
+; XOP-NEXT:    vmovups %ymm3, 64(%rdi)
+; XOP-NEXT:    vmovups %ymm2, (%rdi)
 ; XOP-NEXT:    vzeroupper
 ; XOP-NEXT:    retq
   %s1 = load <8 x i32>, <8 x i32>* %q1, align 4

Modified: llvm/trunk/test/CodeGen/X86/vec_saddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_saddo.ll?rev=369459&r1=369458&r2=369459&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_saddo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_saddo.ll Tue Aug 20 15:12:50 2019
@@ -1791,48 +1791,48 @@ define <2 x i32> @saddo_v2i128(<2 x i128
 ;
 ; AVX512-LABEL: saddo_v2i128:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %rbp
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT:    movq %rcx, %r14
-; AVX512-NEXT:    adcq %r11, %r14
-; AVX512-NEXT:    setns %bl
-; AVX512-NEXT:    testq %rcx, %rcx
-; AVX512-NEXT:    setns %cl
-; AVX512-NEXT:    cmpb %bl, %cl
-; AVX512-NEXT:    setne %bl
-; AVX512-NEXT:    testq %r11, %r11
-; AVX512-NEXT:    setns %al
-; AVX512-NEXT:    cmpb %al, %cl
-; AVX512-NEXT:    sete %al
-; AVX512-NEXT:    andb %bl, %al
-; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT:    testq %r9, %r9
 ; AVX512-NEXT:    setns %al
 ; AVX512-NEXT:    testq %rsi, %rsi
-; AVX512-NEXT:    setns %cl
-; AVX512-NEXT:    cmpb %al, %cl
-; AVX512-NEXT:    sete %al
+; AVX512-NEXT:    setns %bl
+; AVX512-NEXT:    cmpb %al, %bl
+; AVX512-NEXT:    sete %bpl
 ; AVX512-NEXT:    addq %r8, %rdi
 ; AVX512-NEXT:    adcq %r9, %rsi
+; AVX512-NEXT:    setns %al
+; AVX512-NEXT:    cmpb %al, %bl
+; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    andb %bpl, %al
+; AVX512-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT:    movq %rcx, %rbp
+; AVX512-NEXT:    adcq %r10, %rbp
+; AVX512-NEXT:    setns %bl
+; AVX512-NEXT:    testq %rcx, %rcx
+; AVX512-NEXT:    setns %cl
+; AVX512-NEXT:    cmpb %bl, %cl
+; AVX512-NEXT:    setne %r8b
+; AVX512-NEXT:    testq %r10, %r10
 ; AVX512-NEXT:    setns %bl
 ; AVX512-NEXT:    cmpb %bl, %cl
-; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    andb %al, %cl
-; AVX512-NEXT:    andl $1, %ecx
-; AVX512-NEXT:    kmovw %ecx, %k1
+; AVX512-NEXT:    sete %cl
+; AVX512-NEXT:    andb %r8b, %cl
+; AVX512-NEXT:    kmovd %ecx, %k0
+; AVX512-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512-NEXT:    andl $1, %eax
+; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    korw %k0, %k1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    movq %rdx, 16(%r10)
-; AVX512-NEXT:    movq %rdi, (%r10)
-; AVX512-NEXT:    movq %r14, 24(%r10)
-; AVX512-NEXT:    movq %rsi, 8(%r10)
+; AVX512-NEXT:    movq %rdx, 16(%r11)
+; AVX512-NEXT:    movq %rdi, (%r11)
+; AVX512-NEXT:    movq %rbp, 24(%r11)
+; AVX512-NEXT:    movq %rsi, 8(%r11)
 ; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    retq
   %t = call {<2 x i128>, <2 x i1>} @llvm.sadd.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
   %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0

Modified: llvm/trunk/test/CodeGen/X86/vec_smulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_smulo.ll?rev=369459&r1=369458&r2=369459&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_smulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_smulo.ll Tue Aug 20 15:12:50 2019
@@ -2605,9 +2605,9 @@ define <2 x i32> @smulo_v2i128(<2 x i128
 ; AVX512-NEXT:    cmpq $0, {{[0-9]+}}(%rsp)
 ; AVX512-NEXT:    setne %cl
 ; AVX512-NEXT:    kmovd %ecx, %k0
-; AVX512-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512-NEXT:    cmpq $0, {{[0-9]+}}(%rsp)
 ; AVX512-NEXT:    setne %cl
+; AVX512-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512-NEXT:    andl $1, %ecx
 ; AVX512-NEXT:    kmovw %ecx, %k1
 ; AVX512-NEXT:    korw %k0, %k1, %k1

Modified: llvm/trunk/test/CodeGen/X86/vec_ssubo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_ssubo.ll?rev=369459&r1=369458&r2=369459&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_ssubo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_ssubo.ll Tue Aug 20 15:12:50 2019
@@ -1830,48 +1830,48 @@ define <2 x i32> @ssubo_v2i128(<2 x i128
 ;
 ; AVX512-LABEL: ssubo_v2i128:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %rbp
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT:    movq %rcx, %r14
-; AVX512-NEXT:    sbbq %r11, %r14
-; AVX512-NEXT:    setns %bl
-; AVX512-NEXT:    testq %rcx, %rcx
-; AVX512-NEXT:    setns %cl
-; AVX512-NEXT:    cmpb %bl, %cl
-; AVX512-NEXT:    setne %bl
-; AVX512-NEXT:    testq %r11, %r11
-; AVX512-NEXT:    setns %al
-; AVX512-NEXT:    cmpb %al, %cl
-; AVX512-NEXT:    setne %al
-; AVX512-NEXT:    andb %bl, %al
-; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT:    testq %r9, %r9
 ; AVX512-NEXT:    setns %al
 ; AVX512-NEXT:    testq %rsi, %rsi
-; AVX512-NEXT:    setns %cl
-; AVX512-NEXT:    cmpb %al, %cl
-; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    setns %bl
+; AVX512-NEXT:    cmpb %al, %bl
+; AVX512-NEXT:    setne %bpl
 ; AVX512-NEXT:    subq %r8, %rdi
 ; AVX512-NEXT:    sbbq %r9, %rsi
+; AVX512-NEXT:    setns %al
+; AVX512-NEXT:    cmpb %al, %bl
+; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    andb %bpl, %al
+; AVX512-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT:    movq %rcx, %rbp
+; AVX512-NEXT:    sbbq %r10, %rbp
+; AVX512-NEXT:    setns %bl
+; AVX512-NEXT:    testq %rcx, %rcx
+; AVX512-NEXT:    setns %cl
+; AVX512-NEXT:    cmpb %bl, %cl
+; AVX512-NEXT:    setne %r8b
+; AVX512-NEXT:    testq %r10, %r10
 ; AVX512-NEXT:    setns %bl
 ; AVX512-NEXT:    cmpb %bl, %cl
 ; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    andb %al, %cl
-; AVX512-NEXT:    andl $1, %ecx
-; AVX512-NEXT:    kmovw %ecx, %k1
+; AVX512-NEXT:    andb %r8b, %cl
+; AVX512-NEXT:    kmovd %ecx, %k0
+; AVX512-NEXT:    kshiftlw $1, %k0, %k0
+; AVX512-NEXT:    andl $1, %eax
+; AVX512-NEXT:    kmovw %eax, %k1
 ; AVX512-NEXT:    korw %k0, %k1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    movq %rdx, 16(%r10)
-; AVX512-NEXT:    movq %rdi, (%r10)
-; AVX512-NEXT:    movq %r14, 24(%r10)
-; AVX512-NEXT:    movq %rsi, 8(%r10)
+; AVX512-NEXT:    movq %rdx, 16(%r11)
+; AVX512-NEXT:    movq %rdi, (%r11)
+; AVX512-NEXT:    movq %rbp, 24(%r11)
+; AVX512-NEXT:    movq %rsi, 8(%r11)
 ; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    retq
   %t = call {<2 x i128>, <2 x i1>} @llvm.ssub.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
   %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0

Modified: llvm/trunk/test/CodeGen/X86/vec_uaddo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_uaddo.ll?rev=369459&r1=369458&r2=369459&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_uaddo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_uaddo.ll Tue Aug 20 15:12:50 2019
@@ -1282,16 +1282,16 @@ define <2 x i32> @uaddo_v2i128(<2 x i128
 ; AVX512-LABEL: uaddo_v2i128:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    addq %r8, %rdi
+; AVX512-NEXT:    adcq %r9, %rsi
+; AVX512-NEXT:    setb %r8b
 ; AVX512-NEXT:    addq {{[0-9]+}}(%rsp), %rdx
 ; AVX512-NEXT:    adcq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    setb %al
 ; AVX512-NEXT:    kmovd %eax, %k0
 ; AVX512-NEXT:    kshiftlw $1, %k0, %k0
-; AVX512-NEXT:    addq %r8, %rdi
-; AVX512-NEXT:    adcq %r9, %rsi
-; AVX512-NEXT:    setb %al
-; AVX512-NEXT:    andl $1, %eax
-; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    andl $1, %r8d
+; AVX512-NEXT:    kmovw %r8d, %k1
 ; AVX512-NEXT:    korw %k0, %k1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}

Modified: llvm/trunk/test/CodeGen/X86/vec_umulo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_umulo.ll?rev=369459&r1=369458&r2=369459&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_umulo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_umulo.ll Tue Aug 20 15:12:50 2019
@@ -2451,66 +2451,68 @@ define <2 x i32> @umulo_v2i128(<2 x i128
 ; AVX512-NEXT:    pushq %r13
 ; AVX512-NEXT:    pushq %r12
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    movq %rcx, %rax
-; AVX512-NEXT:    movq %rdx, %r12
-; AVX512-NEXT:    movq %rdi, %r11
+; AVX512-NEXT:    movq %r9, %r10
+; AVX512-NEXT:    movq %rcx, %r9
+; AVX512-NEXT:    movq %rdx, %r11
+; AVX512-NEXT:    movq %rsi, %rax
+; AVX512-NEXT:    movq %rdi, %rsi
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r12
 ; AVX512-NEXT:    testq %r10, %r10
 ; AVX512-NEXT:    setne %dl
-; AVX512-NEXT:    testq %rcx, %rcx
-; AVX512-NEXT:    setne %r13b
-; AVX512-NEXT:    andb %dl, %r13b
-; AVX512-NEXT:    mulq %r15
-; AVX512-NEXT:    movq %rax, %rdi
+; AVX512-NEXT:    testq %rax, %rax
+; AVX512-NEXT:    setne %bl
+; AVX512-NEXT:    andb %dl, %bl
+; AVX512-NEXT:    mulq %r8
+; AVX512-NEXT:    movq %rax, %r13
 ; AVX512-NEXT:    seto %bpl
 ; AVX512-NEXT:    movq %r10, %rax
-; AVX512-NEXT:    mulq %r12
-; AVX512-NEXT:    movq %rax, %rbx
+; AVX512-NEXT:    mulq %rdi
+; AVX512-NEXT:    movq %rax, %rdi
 ; AVX512-NEXT:    seto %cl
 ; AVX512-NEXT:    orb %bpl, %cl
-; AVX512-NEXT:    addq %rdi, %rbx
-; AVX512-NEXT:    movq %r12, %rax
-; AVX512-NEXT:    mulq %r15
-; AVX512-NEXT:    movq %rax, %r10
-; AVX512-NEXT:    movq %rdx, %r15
-; AVX512-NEXT:    addq %rbx, %r15
-; AVX512-NEXT:    setb %al
-; AVX512-NEXT:    orb %cl, %al
-; AVX512-NEXT:    orb %r13b, %al
-; AVX512-NEXT:    kmovd %eax, %k0
-; AVX512-NEXT:    kshiftlw $1, %k0, %k0
-; AVX512-NEXT:    testq %r9, %r9
-; AVX512-NEXT:    setne %al
-; AVX512-NEXT:    testq %rsi, %rsi
-; AVX512-NEXT:    setne %cl
-; AVX512-NEXT:    andb %al, %cl
+; AVX512-NEXT:    addq %r13, %rdi
 ; AVX512-NEXT:    movq %rsi, %rax
 ; AVX512-NEXT:    mulq %r8
-; AVX512-NEXT:    movq %rax, %rsi
-; AVX512-NEXT:    seto %bpl
+; AVX512-NEXT:    movq %rax, %r8
+; AVX512-NEXT:    movq %rdx, %r10
+; AVX512-NEXT:    addq %rdi, %r10
+; AVX512-NEXT:    setb %sil
+; AVX512-NEXT:    orb %cl, %sil
+; AVX512-NEXT:    orb %bl, %sil
+; AVX512-NEXT:    testq %r12, %r12
+; AVX512-NEXT:    setne %al
+; AVX512-NEXT:    testq %r9, %r9
+; AVX512-NEXT:    setne %bpl
+; AVX512-NEXT:    andb %al, %bpl
 ; AVX512-NEXT:    movq %r9, %rax
-; AVX512-NEXT:    mulq %r11
+; AVX512-NEXT:    mulq %r15
 ; AVX512-NEXT:    movq %rax, %rdi
-; AVX512-NEXT:    seto %bl
-; AVX512-NEXT:    orb %bpl, %bl
-; AVX512-NEXT:    addq %rsi, %rdi
+; AVX512-NEXT:    seto %r9b
+; AVX512-NEXT:    movq %r12, %rax
+; AVX512-NEXT:    mulq %r11
+; AVX512-NEXT:    movq %rax, %rbx
+; AVX512-NEXT:    seto %cl
+; AVX512-NEXT:    orb %r9b, %cl
+; AVX512-NEXT:    addq %rdi, %rbx
 ; AVX512-NEXT:    movq %r11, %rax
-; AVX512-NEXT:    mulq %r8
-; AVX512-NEXT:    addq %rdi, %rdx
-; AVX512-NEXT:    setb %sil
-; AVX512-NEXT:    orb %bl, %sil
-; AVX512-NEXT:    orb %cl, %sil
+; AVX512-NEXT:    mulq %r15
+; AVX512-NEXT:    addq %rbx, %rdx
+; AVX512-NEXT:    setb %dil
+; AVX512-NEXT:    orb %cl, %dil
+; AVX512-NEXT:    orb %bpl, %dil
+; AVX512-NEXT:    kmovd %edi, %k0
+; AVX512-NEXT:    kshiftlw $1, %k0, %k0
 ; AVX512-NEXT:    andl $1, %esi
 ; AVX512-NEXT:    kmovw %esi, %k1
 ; AVX512-NEXT:    korw %k0, %k1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT:    movq %r10, 16(%r14)
-; AVX512-NEXT:    movq %rax, (%r14)
-; AVX512-NEXT:    movq %r15, 24(%r14)
-; AVX512-NEXT:    movq %rdx, 8(%r14)
+; AVX512-NEXT:    movq %rax, 16(%r14)
+; AVX512-NEXT:    movq %r8, (%r14)
+; AVX512-NEXT:    movq %rdx, 24(%r14)
+; AVX512-NEXT:    movq %r10, 8(%r14)
 ; AVX512-NEXT:    popq %rbx
 ; AVX512-NEXT:    popq %r12
 ; AVX512-NEXT:    popq %r13

Modified: llvm/trunk/test/CodeGen/X86/vec_usubo.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_usubo.ll?rev=369459&r1=369458&r2=369459&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_usubo.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_usubo.ll Tue Aug 20 15:12:50 2019
@@ -1329,16 +1329,16 @@ define <2 x i32> @usubo_v2i128(<2 x i128
 ; AVX512-LABEL: usubo_v2i128:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    subq %r8, %rdi
+; AVX512-NEXT:    sbbq %r9, %rsi
+; AVX512-NEXT:    setb %r8b
 ; AVX512-NEXT:    subq {{[0-9]+}}(%rsp), %rdx
 ; AVX512-NEXT:    sbbq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    setb %al
 ; AVX512-NEXT:    kmovd %eax, %k0
 ; AVX512-NEXT:    kshiftlw $1, %k0, %k0
-; AVX512-NEXT:    subq %r8, %rdi
-; AVX512-NEXT:    sbbq %r9, %rsi
-; AVX512-NEXT:    setb %al
-; AVX512-NEXT:    andl $1, %eax
-; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    andl $1, %r8d
+; AVX512-NEXT:    kmovw %r8d, %k1
 ; AVX512-NEXT:    korw %k0, %k1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
