[llvm] r321070 - [X86][SSE] Use (V)PHMINPOSUW for vXi8 SMAX/SMIN/UMAX/UMIN horizontal reductions (PR32841)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 19 04:02:40 PST 2017


Author: rksimon
Date: Tue Dec 19 04:02:40 2017
New Revision: 321070

URL: http://llvm.org/viewvc/llvm-project?rev=321070&view=rev
Log:
[X86][SSE] Use (V)PHMINPOSUW for vXi8 SMAX/SMIN/UMAX/UMIN horizontal reductions (PR32841)

An extension of D39729, which performed this for vXi16: with the same bit flipping used to handle the SMAX/SMIN/UMAX cases, vXi8 horizontal reductions can also be performed as a UMIN.

This makes use of the fact that, by performing a pair-wise i8 SHUFFLE/UMIN before the PHMINPOSUW, we both get the UMIN of each byte pair and zero-extend the upper bits, ready for v8i16.
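
As an illustration only (not the in-tree DAG combine), the lowering this produces for an i8 SMAX reduction corresponds roughly to the SSE4.1 intrinsics sketch below; the helper name smax_reduce_v16i8 is hypothetical.

#include <smmintrin.h>
#include <cstdint>

static int8_t smax_reduce_v16i8(__m128i v) {
  // Flip into the unsigned-min domain: SMAX(x, y) == 0x7F ^ UMIN(x ^ 0x7F, y ^ 0x7F).
  const __m128i mask = _mm_set1_epi8(0x7F);
  v = _mm_xor_si128(v, mask);

  // Pair-wise byte UMIN: PSRLW $8 moves each odd byte down and zeroes the odd
  // position, so PMINUB leaves each 16-bit lane holding the zero-extended min
  // of its byte pair, ready for PHMINPOSUW.
  __m128i hi = _mm_srli_epi16(v, 8);
  v = _mm_min_epu8(v, hi);

  // PHMINPOSUW: horizontal UMIN of the eight (zero-extended) words.
  v = _mm_minpos_epu16(v);

  // Undo the flip and extract the byte result.
  v = _mm_xor_si128(v, mask);
  return (int8_t)_mm_extract_epi8(v, 0);
}

For SMIN the mask is the signed-min value (0x80), for UMAX it is all-ones, and UMIN needs no flip at all, matching the Mask selection in combineHorizontalMinMaxResult.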

Differential Revision: https://reviews.llvm.org/D41294

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll
    llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll
    llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll
    llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=321070&r1=321069&r2=321070&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Dec 19 04:02:40 2017
@@ -30482,7 +30482,8 @@ static SDValue createPSADBW(SelectionDAG
   return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
 }
 
-// Attempt to replace an min/max v8i16 horizontal reduction with PHMINPOSUW.
+// Attempt to replace an min/max v8i16/v16i8 horizontal reduction with
+// PHMINPOSUW.
 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
                                              const X86Subtarget &Subtarget) {
   // Bail without SSE41.
@@ -30490,7 +30491,7 @@ static SDValue combineHorizontalMinMaxRe
     return SDValue();
 
   EVT ExtractVT = Extract->getValueType(0);
-  if (ExtractVT != MVT::i16)
+  if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
     return SDValue();
 
   // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
@@ -30502,7 +30503,7 @@ static SDValue combineHorizontalMinMaxRe
 
   EVT SrcVT = Src.getValueType();
   EVT SrcSVT = SrcVT.getScalarType();
-  if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
+  if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
     return SDValue();
 
   SDLoc DL(Extract);
@@ -30518,22 +30519,39 @@ static SDValue combineHorizontalMinMaxRe
     SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
     MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
   }
-  assert(SrcVT == MVT::v8i16 && "Unexpected value type");
+  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
+          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
+         "Unexpected value type");
 
   // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
   // to flip the value accordingly.
   SDValue Mask;
+  unsigned MaskEltsBits = ExtractVT.getSizeInBits();
   if (BinOp == ISD::SMAX)
-    Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
+    Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
   else if (BinOp == ISD::SMIN)
-    Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
+    Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
   else if (BinOp == ISD::UMAX)
-    Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);
+    Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
 
   if (Mask)
     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
 
-  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);
+  // For v16i8 cases we need to perform UMIN on pairs of byte elements,
+  // shuffling each upper element down and insert zeros. This means that the
+  // v16i8 UMIN will leave the upper element as zero, performing zero-extension
+  // ready for the PHMINPOS.
+  if (ExtractVT == MVT::i8) {
+    SDValue Upper = DAG.getVectorShuffle(
+        SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
+        {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
+    MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
+  }
+
+  // Perform the PHMINPOS on a v8i16 vector,
+  MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
+  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
+  MinPos = DAG.getBitcast(SrcVT, MinPos);
 
   if (Mask)
     MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
@@ -30849,7 +30867,7 @@ static SDValue combineExtractVectorElt(S
   if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
     return Cmp;
 
-  // Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
+  // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
   if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
     return MinMax;
 

Modified: llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll?rev=321070&r1=321069&r2=321070&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll (original)
+++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll Tue Dec 19 04:02:40 2017
@@ -309,30 +309,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %
 ;
 ; X86-SSE42-LABEL: test_reduce_v16i8:
 ; X86-SSE42:       ## %bb.0:
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test_reduce_v16i8:
 ; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX-NEXT:    retl
@@ -371,30 +366,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %
 ;
 ; X64-SSE42-LABEL: test_reduce_v16i8:
 ; X64-SSE42:       ## %bb.0:
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    psrlw $8, %xmm2
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_reduce_v16i8:
 ; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX-NEXT:    retq
@@ -906,16 +896,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-SSE42-LABEL: test_reduce_v32i8:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
@@ -924,14 +911,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT:    vzeroupper
@@ -940,15 +925,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-AVX2-LABEL: test_reduce_v32i8:
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX2-NEXT:    vzeroupper
@@ -994,16 +977,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-SSE42-LABEL: test_reduce_v32i8:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    psrlw $8, %xmm2
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
@@ -1012,14 +992,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1028,15 +1006,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX2-LABEL: test_reduce_v32i8:
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1045,15 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX512-LABEL: test_reduce_v32i8:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT:    vzeroupper
@@ -1743,16 +1717,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-SSE42-NEXT:    pmaxsb %xmm3, %xmm1
 ; X86-SSE42-NEXT:    pmaxsb %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
@@ -1764,14 +1735,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1781,15 +1750,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX2-NEXT:    vzeroupper
@@ -1847,16 +1814,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-SSE42-NEXT:    pmaxsb %xmm3, %xmm1
 ; X64-SSE42-NEXT:    pmaxsb %xmm2, %xmm0
 ; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    psrlw $8, %xmm2
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
@@ -1868,14 +1832,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1885,15 +1847,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1902,17 +1862,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX512-LABEL: test_reduce_v64i8:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT:    vzeroupper

Modified: llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll?rev=321070&r1=321069&r2=321070&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll (original)
+++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll Tue Dec 19 04:02:40 2017
@@ -311,30 +311,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %
 ;
 ; X86-SSE42-LABEL: test_reduce_v16i8:
 ; X86-SSE42:       ## %bb.0:
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test_reduce_v16i8:
 ; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX-NEXT:    retl
@@ -373,30 +368,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %
 ;
 ; X64-SSE42-LABEL: test_reduce_v16i8:
 ; X64-SSE42:       ## %bb.0:
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    psrlw $8, %xmm2
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_reduce_v16i8:
 ; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX-NEXT:    retq
@@ -910,16 +900,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-SSE42-LABEL: test_reduce_v32i8:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
@@ -928,14 +915,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT:    vzeroupper
@@ -944,15 +929,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-AVX2-LABEL: test_reduce_v32i8:
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX2-NEXT:    vzeroupper
@@ -998,16 +981,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-SSE42-LABEL: test_reduce_v32i8:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    psrlw $8, %xmm2
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
@@ -1016,14 +996,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1032,15 +1010,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX2-LABEL: test_reduce_v32i8:
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1049,15 +1025,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX512-LABEL: test_reduce_v32i8:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT:    vzeroupper
@@ -1745,16 +1719,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-SSE42-NEXT:    pminsb %xmm3, %xmm1
 ; X86-SSE42-NEXT:    pminsb %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
@@ -1766,14 +1737,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1783,15 +1752,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX2-NEXT:    vzeroupper
@@ -1849,16 +1816,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-SSE42-NEXT:    pminsb %xmm3, %xmm1
 ; X64-SSE42-NEXT:    pminsb %xmm2, %xmm0
 ; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    psrlw $8, %xmm2
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
@@ -1870,14 +1834,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1887,15 +1849,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1904,17 +1864,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX512-LABEL: test_reduce_v64i8:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT:    vzeroupper

Modified: llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll?rev=321070&r1=321069&r2=321070&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll (original)
+++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll Tue Dec 19 04:02:40 2017
@@ -362,30 +362,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %
 ;
 ; X86-SSE42-LABEL: test_reduce_v16i8:
 ; X86-SSE42:       ## %bb.0:
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test_reduce_v16i8:
 ; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX-NEXT:    retl
@@ -408,30 +403,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %
 ;
 ; X64-SSE42-LABEL: test_reduce_v16i8:
 ; X64-SSE42:       ## %bb.0:
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    psrlw $8, %xmm2
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_reduce_v16i8:
 ; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX-NEXT:    retq
@@ -1031,16 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-SSE42-LABEL: test_reduce_v32i8:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
@@ -1049,14 +1036,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1065,15 +1050,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-AVX2-LABEL: test_reduce_v32i8:
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX2-NEXT:    vzeroupper
@@ -1099,16 +1082,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-SSE42-LABEL: test_reduce_v32i8:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    psrlw $8, %xmm2
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
@@ -1117,14 +1097,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1133,15 +1111,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX2-LABEL: test_reduce_v32i8:
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1150,15 +1126,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX512-LABEL: test_reduce_v32i8:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT:    vzeroupper
@@ -1992,16 +1966,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-SSE42-NEXT:    pmaxub %xmm3, %xmm1
 ; X86-SSE42-NEXT:    pmaxub %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
@@ -2013,14 +1984,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT:    vzeroupper
@@ -2030,15 +1999,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX2-NEXT:    vzeroupper
@@ -2068,16 +2035,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-SSE42-NEXT:    pmaxub %xmm3, %xmm1
 ; X64-SSE42-NEXT:    pmaxub %xmm2, %xmm0
 ; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    psrlw $8, %xmm2
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
@@ -2089,14 +2053,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX1-NEXT:    vzeroupper
@@ -2106,15 +2068,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX2-NEXT:    vzeroupper
@@ -2123,17 +2083,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX512-LABEL: test_reduce_v64i8:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT:    vzeroupper

Modified: llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll?rev=321070&r1=321069&r2=321070&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll (original)
+++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll Tue Dec 19 04:02:40 2017
@@ -352,30 +352,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %
 ;
 ; X86-SSE42-LABEL: test_reduce_v16i8:
 ; X86-SSE42:       ## %bb.0:
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    psrlw $8, %xmm1
 ; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test_reduce_v16i8:
 ; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX-NEXT:    retl
@@ -398,30 +387,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %
 ;
 ; X64-SSE42-LABEL: test_reduce_v16i8:
 ; X64-SSE42:       ## %bb.0:
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
 ; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    psrlw $8, %xmm1
 ; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_reduce_v16i8:
 ; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX-NEXT:    retq
@@ -1004,16 +982,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-SSE42-LABEL: test_reduce_v32i8:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    psrlw $8, %xmm1
 ; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
@@ -1022,14 +994,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1038,15 +1005,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X86-AVX2-LABEL: test_reduce_v32i8:
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX2-NEXT:    vzeroupper
@@ -1072,16 +1034,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-SSE42-LABEL: test_reduce_v32i8:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
 ; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    psrlw $8, %xmm1
 ; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
@@ -1090,14 +1046,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1106,15 +1057,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX2-LABEL: test_reduce_v32i8:
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1123,15 +1069,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %
 ; X64-AVX512-LABEL: test_reduce_v32i8:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT:    vzeroupper
@@ -1942,16 +1883,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-SSE42-NEXT:    pminub %xmm3, %xmm1
 ; X86-SSE42-NEXT:    pminub %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
 ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    psrlw $8, %xmm1
 ; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
@@ -1963,14 +1898,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT:    vzeroupper
@@ -1980,15 +1910,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX2-NEXT:    vzeroupper
@@ -2018,16 +1943,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-SSE42-NEXT:    pminub %xmm3, %xmm1
 ; X64-SSE42-NEXT:    pminub %xmm2, %xmm0
 ; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
 ; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    psrlw $8, %xmm1
 ; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
@@ -2039,14 +1958,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm2
 ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
 ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX1-NEXT:    vzeroupper
@@ -2056,15 +1970,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX2-NEXT:    vzeroupper
@@ -2073,17 +1982,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %
 ; X64-AVX512-LABEL: test_reduce_v64i8:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT:    vzeroupper

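For context, the vXi8 reductions updated above all start from the usual log2 shuffle/compare/select reduction shape in IR before lowering. A minimal hand-written sketch of that shape for a v16i8 UMIN reduction (an assumed illustration, not the exact body of the test_reduce_v16i8 function, which is elided in this diff) looks like:

; Sketch only: fold the upper half onto the lower half with icmp ult + select
; at each step, then extract lane 0.
define i8 @umin_v16i8_sketch(<16 x i8> %a0) {
  %s3 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %c3 = icmp ult <16 x i8> %a0, %s3
  %m3 = select <16 x i1> %c3, <16 x i8> %a0, <16 x i8> %s3
  %s2 = shufflevector <16 x i8> %m3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %c2 = icmp ult <16 x i8> %m3, %s2
  %m2 = select <16 x i1> %c2, <16 x i8> %m3, <16 x i8> %s2
  %s1 = shufflevector <16 x i8> %m2, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %c1 = icmp ult <16 x i8> %m2, %s1
  %m1 = select <16 x i1> %c1, <16 x i8> %m2, <16 x i8> %s1
  %s0 = shufflevector <16 x i8> %m1, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %c0 = icmp ult <16 x i8> %m1, %s0
  %m0 = select <16 x i1> %c0, <16 x i8> %m1, <16 x i8> %s0
  %r = extractelement <16 x i8> %m0, i32 0
  ret i8 %r
}

With SSE4.1 available, that whole chain now lowers to a single psrlw $8 / pminub pair feeding phminposuw (plus a pxor flip before and after for the max variants), matching the updated checks above.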