[llvm] r317439 - [X86][SSE] Tests for integer min/max horizontal reductions

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sun Nov 5 11:48:24 PST 2017


Author: rksimon
Date: Sun Nov  5 11:48:24 2017
New Revision: 317439

URL: http://llvm.org/viewvc/llvm-project?rev=317439&view=rev
Log:
[X86][SSE] Tests for integer min/max horizontal reductions

Matching the shuffle/compare/select patterns that the vectorizers should have created for us.

Tests for the equivalent experimental reduction intrinsics should probably be added as well.
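
For illustration, each test below spells the reduction out as the manual
log2(N) shuffle/icmp/select ladder. The same smax reduction could instead be
written with the experimental reduction intrinsic. A minimal sketch only, not
part of this commit; the i32.v4i32 name mangling follows the current
experimental convention and is an assumption here:

    ; Sketch, not part of this commit: the shuffle-based
    ; test_reduce_v4i32 pattern below, expressed with the experimental
    ; signed-max reduction intrinsic instead of the shuffle ladder.
    declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>)

    define i32 @reduce_v4i32_smax(<4 x i32> %a0) {
      %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %a0)
      ret i32 %r
    }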

Added:
    llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll
    llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll
    llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll
    llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll

Added: llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll?rev=317439&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll (added)
+++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-smax.ll Sun Nov  5 11:48:24 2017
@@ -0,0 +1,1896 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2     | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2   | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx      | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2     | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2   | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx    | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2   | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    movd %xmm3, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT:    movd %xmm2, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm5, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X64-SSE2-NEXT:    por %xmm0, %xmm3
+; X64-SSE2-NEXT:    movq %xmm3, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT:    movq %xmm2, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    retq
+  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %2 = icmp sgt <2 x i64> %a0, %1
+  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+  %4 = extractelement <2 x i64> %3, i32 0
+  ret i64 %4
+}
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovd %xmm0, %eax
+; X64-AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = icmp sgt <4 x i32> %a0, %1
+  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %5 = icmp sgt <4 x i32> %3, %4
+  %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+  %7 = extractelement <4 x i32> %6, i32 0
+  ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v8i16:
+; X86-SSE:       ## BB#0:
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
+; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: test_reduce_v8i16:
+; X64-SSE:       ## BB#0:
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE-NEXT:    psrld $16, %xmm1
+; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovd %xmm0, %eax
+; X64-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT:    retq
+  %1  = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp sgt <8 x i16> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+  %4  = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp sgt <8 x i16> %3, %4
+  %6  = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+  %7  = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp sgt <8 x i16> %6, %7
+  %9  = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+  %10 = extractelement <8 x i16> %9, i32 0
+  ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT:    retq
+  %1  = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp sgt <16 x i8> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+  %4  = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp sgt <16 x i8> %3, %4
+  %6  = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+  %7  = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp sgt <16 x i8> %6, %7
+  %9  = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+  %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp sgt <16 x i8> %9, %10
+  %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+  %13 = extractelement <16 x i8> %12, i32 0
+  ret i8 %13
+}
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT:    movd %xmm2, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm6, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm3, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm5, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm4, %xmm2
+; X64-SSE2-NEXT:    movq %xmm2, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT:    movq %xmm2, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = icmp sgt <4 x i64> %a0, %1
+  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %5 = icmp sgt <4 x i64> %3, %4
+  %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+  %7 = extractelement <4 x i64> %6, i32 0
+  ret i64 %7
+}
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    movd %xmm2, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp sgt <8 x i32> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+  %4  = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp sgt <8 x i32> %3, %4
+  %6  = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+  %7  = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp sgt <8 x i32> %6, %7
+  %9  = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+  %10 = extractelement <8 x i32> %9, i32 0
+  ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v16i16:
+; X86-SSE:       ## BB#0:
+; X86-SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
+; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE-LABEL: test_reduce_v16i16:
+; X64-SSE:       ## BB#0:
+; X64-SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE-NEXT:    psrld $16, %xmm1
+; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp sgt <16 x i16> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+  %4  = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp sgt <16 x i16> %3, %4
+  %6  = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+  %7  = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp sgt <16 x i16> %6, %7
+  %9  = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+  %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp sgt <16 x i16> %9, %10
+  %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+  %13 = extractelement <16 x i16> %12, i32 0
+  ret i16 %13
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    movd %xmm2, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp sgt <32 x i8> %a0, %1
+  %3  = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+  %4  = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp sgt <32 x i8> %3, %4
+  %6  = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+  %7  = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp sgt <32 x i8> %6, %7
+  %9  = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+  %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp sgt <32 x i8> %9, %10
+  %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+  %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp sgt <32 x i8> %12, %13
+  %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+  %16 = extractelement <32 x i8> %15, i32 0
+  ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    subl $28, %esp
+; X86-SSE2-NEXT:    .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm6
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm6, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X86-SSE2-NEXT:    movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm7, %xmm6
+; X86-SSE2-NEXT:    pand %xmm6, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X86-SSE2-NEXT:    por %xmm1, %xmm6
+; X86-SSE2-NEXT:    pand %xmm5, %xmm2
+; X86-SSE2-NEXT:    pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT:    por %xmm2, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm6
+; X86-SSE2-NEXT:    pandn %xmm5, %xmm0
+; X86-SSE2-NEXT:    por %xmm6, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    addl $28, %esp
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT:    movdqa %xmm4, %xmm5
+; X86-SSE42-NEXT:    pcmpgtq %xmm2, %xmm5
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT:    movapd %xmm2, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm8, %xmm6
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm6, %xmm8
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X64-SSE2-NEXT:    movdqa %xmm7, %xmm5
+; X64-SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm9, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm7, %xmm6
+; X64-SSE2-NEXT:    pand %xmm6, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm6
+; X64-SSE2-NEXT:    por %xmm0, %xmm6
+; X64-SSE2-NEXT:    pand %xmm8, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm8
+; X64-SSE2-NEXT:    por %xmm1, %xmm8
+; X64-SSE2-NEXT:    movdqa %xmm8, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm6
+; X64-SSE2-NEXT:    pandn %xmm8, %xmm1
+; X64-SSE2-NEXT:    por %xmm6, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm5, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm1, %xmm3
+; X64-SSE2-NEXT:    movq %xmm3, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT:    movdqa %xmm4, %xmm5
+; X64-SSE42-NEXT:    pcmpgtq %xmm2, %xmm5
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT:    movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT:    movapd %xmm2, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT:    movq %xmm1, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp sgt <8 x i64> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+  %4  = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp sgt <8 x i64> %3, %4
+  %6  = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+  %7  = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp sgt <8 x i64> %6, %7
+  %9  = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+  %10 = extractelement <8 x i64> %9, i32 0
+  ret i64 %10
+}
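
For comparison, the v8i64 reduction above can also be expressed through LLVM's experimental reduction intrinsics rather than the explicit shuffle ladder. A minimal sketch, with an illustrative function name, assuming the intrinsic name and type mangling in use at the time of this commit (treat the exact mangling as an assumption):

    ; Hypothetical intrinsic form of test_reduce_v8i64 above.
    declare i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64>)

    define i64 @reduce_v8i64_intrinsic(<8 x i64> %a0) {
      %r = call i64 @llvm.experimental.vector.reduce.smax.i64.v8i64(<8 x i64> %a0)
      ret i64 %r
    }
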
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT:    pand %xmm5, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
+; X86-SSE2-NEXT:    por %xmm1, %xmm5
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm5, %xmm0
+; X86-SSE2-NEXT:    por %xmm4, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxsd %xmm3, %xmm1
+; X86-SSE42-NEXT:    pmaxsd %xmm2, %xmm0
+; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT:    pand %xmm5, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm5
+; X64-SSE2-NEXT:    por %xmm1, %xmm5
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
+; X64-SSE2-NEXT:    pand %xmm0, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm5, %xmm0
+; X64-SSE2-NEXT:    por %xmm4, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxsd %xmm3, %xmm1
+; X64-SSE42-NEXT:    pmaxsd %xmm2, %xmm0
+; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxsd %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpmaxsd %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxsd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp sgt <16 x i32> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+  %4  = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp sgt <16 x i32> %3, %4
+  %6  = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+  %7  = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp sgt <16 x i32> %6, %7
+  %9  = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+  %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp sgt <16 x i32> %9, %10
+  %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+  %13 = extractelement <16 x i32> %12, i32 0
+  ret i32 %13
+}
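
Each level of the ladder above is one compare-and-select: on SSE4.1 and later it maps directly to pmaxsd, while the SSE2 checks synthesize it from pcmpgtd plus a pand/pandn/por mask blend. A minimal sketch of a single level in isolation (function name is illustrative):

    ; One reduction level: this icmp/select pair is what a single pmaxsd
    ; (or the four-instruction SSE2 mask blend) implements.
    define <4 x i32> @smax_step_v4i32(<4 x i32> %x, <4 x i32> %y) {
      %c = icmp sgt <4 x i32> %x, %y
      %r = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %y
      ret <4 x i32> %r
    }
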
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v32i16:
+; X86-SSE:       ## BB#0:
+; X86-SSE-NEXT:    pmaxsw %xmm3, %xmm1
+; X86-SSE-NEXT:    pmaxsw %xmm2, %xmm0
+; X86-SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
+; X86-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE-LABEL: test_reduce_v32i16:
+; X64-SSE:       ## BB#0:
+; X64-SSE-NEXT:    pmaxsw %xmm3, %xmm1
+; X64-SSE-NEXT:    pmaxsw %xmm2, %xmm0
+; X64-SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT:    pmaxsw %xmm1, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE-NEXT:    psrld $16, %xmm1
+; X64-SSE-NEXT:    pmaxsw %xmm0, %xmm1
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpmaxsw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxsw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp sgt <32 x i16> %a0, %1
+  %3  = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+  %4  = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp sgt <32 x i16> %3, %4
+  %6  = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+  %7  = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp sgt <32 x i16> %6, %7
+  %9  = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+  %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp sgt <32 x i16> %9, %10
+  %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+  %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp sgt <32 x i16> %12, %13
+  %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+  %16 = extractelement <32 x i16> %15, i32 0
+  ret i16 %16
+}
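
Once the i16 reduction is down to two adjacent word lanes, pshufd has no 16-bit granularity left, so the checks above use psrld $16 to bring lane 1 down into lane 0 before the final pmaxsw. The last level written out at the IR level, as a sketch:

    ; Final level for i16: pair lane 1 against lane 0, keep the larger.
    define i16 @smax_tail_v8i16(<8 x i16> %v) {
      %s = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
      %c = icmp sgt <8 x i16> %v, %s
      %m = select <8 x i1> %c, <8 x i16> %v, <8 x i16> %s
      %r = extractelement <8 x i16> %m, i32 0
      ret i16 %r
    }
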
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
+; X86-SSE2-NEXT:    pcmpgtb %xmm3, %xmm5
+; X86-SSE2-NEXT:    pand %xmm5, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
+; X86-SSE2-NEXT:    por %xmm1, %xmm5
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT:    pcmpgtb %xmm5, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm5, %xmm0
+; X86-SSE2-NEXT:    por %xmm4, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxsb %xmm3, %xmm1
+; X86-SSE42-NEXT:    pmaxsb %xmm2, %xmm0
+; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT:    pcmpgtb %xmm2, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT:    pcmpgtb %xmm3, %xmm5
+; X64-SSE2-NEXT:    pand %xmm5, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm5
+; X64-SSE2-NEXT:    por %xmm1, %xmm5
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT:    pcmpgtb %xmm5, %xmm0
+; X64-SSE2-NEXT:    pand %xmm0, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm5, %xmm0
+; X64-SSE2-NEXT:    por %xmm4, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxsb %xmm3, %xmm1
+; X64-SSE42-NEXT:    pmaxsb %xmm2, %xmm0
+; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp sgt <64 x i8> %a0, %1
+  %3  = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+  %4  = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp sgt <64 x i8> %3, %4
+  %6  = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+  %7  = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp sgt <64 x i8> %6, %7
+  %9  = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+  %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp sgt <64 x i8> %9, %10
+  %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+  %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp sgt <64 x i8> %12, %13
+  %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+  %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %17 = icmp sgt <64 x i8> %15, %16
+  %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+  %19 = extractelement <64 x i8> %18, i32 0
+  ret i8 %19
+}

Added: llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll?rev=317439&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll (added)
+++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-smin.ll Sun Nov  5 11:48:24 2017
@@ -0,0 +1,1898 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2     | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2   | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx      | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2     | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2   | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx    | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2   | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    movd %xmm3, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT:    movd %xmm2, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm5, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X64-SSE2-NEXT:    por %xmm0, %xmm3
+; X64-SSE2-NEXT:    movq %xmm3, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT:    movq %xmm2, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminsq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    retq
+  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %2 = icmp slt <2 x i64> %a0, %1
+  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+  %4 = extractelement <2 x i64> %3, i32 0
+  ret i64 %4
+}
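
The [2147483648,0,2147483648,0] constant in the SSE2 checks is a sign bias: lacking a 64-bit compare instruction, the backend flips the sign bit of the low dword of each quadword, compares the pieces with pcmpgtd, and stitches the halves back together with pcmpeqd. The identity being exploited, shown whole-width at the IR level (a sketch of the trick, not the exact dword-split sequence the backend emits):

    ; x <s y  <=>  (x ^ 0x8000000000000000) <u (y ^ 0x8000000000000000)
    define <2 x i64> @smin_via_bias(<2 x i64> %x, <2 x i64> %y) {
      %xb = xor <2 x i64> %x, <i64 -9223372036854775808, i64 -9223372036854775808>
      %yb = xor <2 x i64> %y, <i64 -9223372036854775808, i64 -9223372036854775808>
      %c  = icmp ult <2 x i64> %xb, %yb
      %r  = select <2 x i1> %c, <2 x i64> %x, <2 x i64> %y
      ret <2 x i64> %r
    }
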
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovd %xmm0, %eax
+; X64-AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = icmp slt <4 x i32> %a0, %1
+  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %5 = icmp slt <4 x i32> %3, %4
+  %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+  %7 = extractelement <4 x i32> %6, i32 0
+  ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v8i16:
+; X86-SSE:       ## BB#0:
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
+; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE-LABEL: test_reduce_v8i16:
+; X64-SSE:       ## BB#0:
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE-NEXT:    psrld $16, %xmm1
+; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovd %xmm0, %eax
+; X64-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT:    retq
+  %1  = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp slt <8 x i16> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+  %4  = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp slt <8 x i16> %3, %4
+  %6  = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+  %7  = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp slt <8 x i16> %6, %7
+  %9  = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+  %10 = extractelement <8 x i16> %9, i32 0
+  ret i16 %10
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT:    retq
+  %1  = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp slt <16 x i8> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+  %4  = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp slt <16 x i8> %3, %4
+  %6  = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+  %7  = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp slt <16 x i8> %6, %7
+  %9  = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+  %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp slt <16 x i8> %9, %10
+  %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+  %13 = extractelement <16 x i8> %12, i32 0
+  ret i8 %13
+}
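
SSE2 only provides an unsigned byte min (pminub); signed pminsb arrives with SSE4.1, which is why the X86-SSE2 checks above fall back to the pcmpgtb mask blend at every level. A sign-bias identity would let pminub stand in; a sketch of that alternative (not what this lowering emits):

    ; smin(x, y) == umin(x ^ 0x80, y ^ 0x80) ^ 0x80, lane-wise on bytes.
    define <16 x i8> @smin_via_pminub(<16 x i8> %x, <16 x i8> %y) {
      %xb = xor <16 x i8> %x, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
      %yb = xor <16 x i8> %y, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
      %c  = icmp ult <16 x i8> %xb, %yb
      %m  = select <16 x i1> %c, <16 x i8> %xb, <16 x i8> %yb
      %r  = xor <16 x i8> %m, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
      ret <16 x i8> %r
    }
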
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT:    movd %xmm2, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm6, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm3, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm5, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm4, %xmm2
+; X64-SSE2-NEXT:    movq %xmm2, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT:    movq %xmm2, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminsq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = icmp slt <4 x i64> %a0, %1
+  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %5 = icmp slt <4 x i64> %3, %4
+  %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+  %7 = extractelement <4 x i64> %6, i32 0
+  ret i64 %7
+}
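+
+; The ladder above is the log2 shuffle/icmp/select pattern a vectorizer emits
+; for a signed-min reduction of <4 x i64>: fold the high half onto the low
+; half, then fold element 1 onto element 0 and extract. Pre-SSE4.2 there is
+; no 64-bit PCMPGTQ, so the SSE2 lowering synthesizes the compare from 32-bit
+; ops: the [2147483648,0,...] mask flips only the low dword's sign bit so
+; PCMPGTD acts as an unsigned compare there, and the result is assembled as
+; highGT | (highEQ & lowGT). As the commit log notes, the experimental
+; reduction intrinsics are not covered here yet; the equivalent single call
+; would look something like this (declaration assumed, sketch only):
+;   declare i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64>)
+;   %r = call i64 @llvm.experimental.vector.reduce.smin.i64.v4i64(<4 x i64> %a0)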
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    movd %xmm2, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp slt <8 x i32> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+  %4  = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp slt <8 x i32> %3, %4
+  %6  = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+  %7  = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp slt <8 x i32> %6, %7
+  %9  = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+  %10 = extractelement <8 x i32> %9, i32 0
+  ret i32 %10
+}
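+
+; v8i32 signed-min: PMINSD only arrived with SSE4.1, so the SSE2 paths blend
+; each halving step by hand with PCMPGTD/PAND/PANDN/POR, while the SSE4.2 and
+; AVX paths collapse every step to a single min instruction.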
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v16i16:
+; X86-SSE:       ## BB#0:
+; X86-SSE-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
+; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE-LABEL: test_reduce_v16i16:
+; X64-SSE:       ## BB#0:
+; X64-SSE-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE-NEXT:    psrld $16, %xmm1
+; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp slt <16 x i16> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+  %4  = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp slt <16 x i16> %3, %4
+  %6  = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+  %7  = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp slt <16 x i16> %6, %7
+  %9  = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+  %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp slt <16 x i16> %9, %10
+  %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+  %13 = extractelement <16 x i16> %12, i32 0
+  ret i16 %13
+}
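+
+; v16i16 signed-min: PMINSW has existed since SSE2, so every SSE path uses a
+; real min at each step; the final word-level fold shifts the odd element
+; down with PSRLD $16 rather than spending another shuffle.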
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    movd %xmm2, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp slt <32 x i8> %a0, %1
+  %3  = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+  %4  = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp slt <32 x i8> %3, %4
+  %6  = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+  %7  = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp slt <32 x i8> %6, %7
+  %9  = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+  %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp slt <32 x i8> %9, %10
+  %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+  %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp slt <32 x i8> %12, %13
+  %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+  %16 = extractelement <32 x i8> %15, i32 0
+  ret i8 %16
+}
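+
+; v32i8 signed-min: PMINSB is SSE4.1+, so SSE2 falls back to the
+; PCMPGTB/PAND/PANDN/POR blend at all five halving steps, with PSRLD/PSRLW
+; shifts bringing the next byte into lane 0 for the sub-dword folds.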
+
+;
+; 512-bit Vectors
+;
+
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    subl $28, %esp
+; X86-SSE2-NEXT:    .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm6
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm6, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X86-SSE2-NEXT:    movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm7, %xmm6
+; X86-SSE2-NEXT:    pand %xmm6, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X86-SSE2-NEXT:    por %xmm1, %xmm6
+; X86-SSE2-NEXT:    pand %xmm5, %xmm2
+; X86-SSE2-NEXT:    pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT:    por %xmm2, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm6, %xmm0
+; X86-SSE2-NEXT:    por %xmm5, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    addl $28, %esp
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT:    movdqa %xmm3, %xmm5
+; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm5
+; X86-SSE42-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT:    movapd %xmm3, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,0,2147483648,0]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm8, %xmm6
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm6, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm7
+; X64-SSE2-NEXT:    movdqa %xmm7, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm8, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm7, %xmm6
+; X64-SSE2-NEXT:    pand %xmm6, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X64-SSE2-NEXT:    por %xmm1, %xmm6
+; X64-SSE2-NEXT:    pand %xmm5, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X64-SSE2-NEXT:    por %xmm0, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm5
+; X64-SSE2-NEXT:    pandn %xmm6, %xmm1
+; X64-SSE2-NEXT:    por %xmm5, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm9
+; X64-SSE2-NEXT:    movdqa %xmm9, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm2, %xmm9
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm4, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm1, %xmm3
+; X64-SSE2-NEXT:    movq %xmm3, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT:    movdqa %xmm3, %xmm5
+; X64-SSE42-NEXT:    pcmpgtq %xmm1, %xmm5
+; X64-SSE42-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT:    movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT:    movapd %xmm3, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT:    movq %xmm1, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; X64-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp slt <8 x i64> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+  %4  = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp slt <8 x i64> %3, %4
+  %6  = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+  %7  = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp slt <8 x i64> %6, %7
+  %9  = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+  %10 = extractelement <8 x i64> %9, i32 0
+  ret i64 %10
+}
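+
+; v8i64 signed-min over 512 bits: AVX512 finally has a native VPMINSQ, though
+; the narrowing still goes VEXTRACTI64X4/VEXTRACTI128 and keeps issuing
+; zmm-width mins even once only 128 bits remain live. A single intrinsic call
+; (declaration assumed, sketch only) would read:
+;   declare i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64>)
+;   %r = call i64 @llvm.experimental.vector.reduce.smin.i64.v8i64(<8 x i64> %a0)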
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; X86-SSE2-NEXT:    pand %xmm5, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X86-SSE2-NEXT:    por %xmm0, %xmm5
+; X86-SSE2-NEXT:    pand %xmm4, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm4
+; X86-SSE2-NEXT:    por %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm4, %xmm0
+; X86-SSE2-NEXT:    por %xmm5, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminsd %xmm3, %xmm1
+; X86-SSE42-NEXT:    pminsd %xmm2, %xmm0
+; X86-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminsd %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminsd %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm5
+; X64-SSE2-NEXT:    pand %xmm5, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X64-SSE2-NEXT:    por %xmm0, %xmm5
+; X64-SSE2-NEXT:    pand %xmm4, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm4
+; X64-SSE2-NEXT:    por %xmm1, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT:    pcmpgtd %xmm5, %xmm0
+; X64-SSE2-NEXT:    pand %xmm0, %xmm5
+; X64-SSE2-NEXT:    pandn %xmm4, %xmm0
+; X64-SSE2-NEXT:    por %xmm5, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminsd %xmm3, %xmm1
+; X64-SSE42-NEXT:    pminsd %xmm2, %xmm0
+; X64-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminsd %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminsd %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpminsd %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminsd %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminsd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp slt <16 x i32> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+  %4  = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp slt <16 x i32> %3, %4
+  %6  = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+  %7  = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp slt <16 x i32> %6, %7
+  %9  = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+  %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp slt <16 x i32> %9, %10
+  %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+  %13 = extractelement <16 x i32> %12, i32 0
+  ret i32 %13
+}
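+
+; v16i32 signed-min: the first halving step folds the upper 256 bits with a
+; pair of 128-bit mins on SSE4.2/AVX1 (one ymm min on AVX2/AVX512), after
+; which the usual 128-bit shuffle ladder finishes the reduction.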
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE-LABEL: test_reduce_v32i16:
+; X86-SSE:       ## BB#0:
+; X86-SSE-NEXT:    pminsw %xmm3, %xmm1
+; X86-SSE-NEXT:    pminsw %xmm2, %xmm0
+; X86-SSE-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE-NEXT:    pminsw %xmm1, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE-NEXT:    psrld $16, %xmm1
+; X86-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X86-SSE-NEXT:    movd %xmm1, %eax
+; X86-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE-LABEL: test_reduce_v32i16:
+; X64-SSE:       ## BB#0:
+; X64-SSE-NEXT:    pminsw %xmm3, %xmm1
+; X64-SSE-NEXT:    pminsw %xmm2, %xmm0
+; X64-SSE-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE-NEXT:    pminsw %xmm1, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE-NEXT:    psrld $16, %xmm1
+; X64-SSE-NEXT:    pminsw %xmm0, %xmm1
+; X64-SSE-NEXT:    movd %xmm1, %eax
+; X64-SSE-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpminsw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminsw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminsw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminsw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp slt <32 x i16> %a0, %1
+  %3  = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+  %4  = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp slt <32 x i16> %3, %4
+  %6  = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+  %7  = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp slt <32 x i16> %6, %7
+  %9  = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+  %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp slt <32 x i16> %9, %10
+  %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+  %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp slt <32 x i16> %12, %13
+  %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+  %16 = extractelement <32 x i16> %15, i32 0
+  ret i16 %16
+}
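+
+; v32i16 signed-min: same shape as v16i16 with an extra layer of PMINSW to
+; fold the upper 256 bits first; AVX512 has no dedicated cross-lane word
+; reduction either, so it emits the same extract-and-min ladder at zmm width.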
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm5
+; X86-SSE2-NEXT:    pand %xmm5, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X86-SSE2-NEXT:    por %xmm0, %xmm5
+; X86-SSE2-NEXT:    pand %xmm4, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm4
+; X86-SSE2-NEXT:    por %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT:    pcmpgtb %xmm5, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm4, %xmm0
+; X86-SSE2-NEXT:    por %xmm5, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminsb %xmm3, %xmm1
+; X86-SSE42-NEXT:    pminsb %xmm2, %xmm0
+; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT:    pcmpgtb %xmm1, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm5
+; X64-SSE2-NEXT:    pand %xmm5, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X64-SSE2-NEXT:    por %xmm0, %xmm5
+; X64-SSE2-NEXT:    pand %xmm4, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm4
+; X64-SSE2-NEXT:    por %xmm1, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT:    pcmpgtb %xmm5, %xmm0
+; X64-SSE2-NEXT:    pand %xmm0, %xmm5
+; X64-SSE2-NEXT:    pandn %xmm4, %xmm0
+; X64-SSE2-NEXT:    por %xmm5, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X64-SSE2-NEXT:    por %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminsb %xmm3, %xmm1
+; X64-SSE42-NEXT:    pminsb %xmm2, %xmm0
+; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp slt <64 x i8> %a0, %1
+  %3  = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+  %4  = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp slt <64 x i8> %3, %4
+  %6  = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+  %7  = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp slt <64 x i8> %6, %7
+  %9  = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+  %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp slt <64 x i8> %9, %10
+  %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+  %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp slt <64 x i8> %12, %13
+  %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+  %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %17 = icmp slt <64 x i8> %15, %16
+  %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+  %19 = extractelement <64 x i8> %18, i32 0
+  ret i8 %19
+}
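+
+; NOTE (illustration only, not autogenerated): the commit message suggests the
+; experimental reduction intrinsics should eventually be covered as well. With
+; them, the whole shuffle/icmp/select ladder above collapses to a single call.
+; The mangled name below follows the LangRef convention of this era and is an
+; assumption, not verified llc output.
+declare i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8>)
+
+define i8 @test_reduce_v64i8_intrinsic(<64 x i8> %a0) {
+  %1 = call i8 @llvm.experimental.vector.reduce.smin.i8.v64i8(<64 x i8> %a0)
+  ret i8 %1
+}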

Added: llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll?rev=317439&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll (added)
+++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-umax.ll Sun Nov  5 11:48:24 2017
@@ -0,0 +1,2203 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2     | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2   | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx      | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2     | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2   | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx    | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2   | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    movd %xmm3, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT:    movd %xmm2, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; X86-AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm5, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X64-SSE2-NEXT:    por %xmm0, %xmm3
+; X64-SSE2-NEXT:    movq %xmm3, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT:    movq %xmm2, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX2-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
+; X64-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxuq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    retq
+  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %2 = icmp ugt <2 x i64> %a0, %1
+  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+  %4 = extractelement <2 x i64> %3, i32 0
+  ret i64 %4
+}
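+
+; NOTE (sketch, not a checked test): pre-AVX512 x86 has no unsigned 64-bit
+; vector compare, so the lowering above XORs both operands so that a signed
+; compare yields the unsigned order: 0x8000000000000000 for the SSE4.2/AVX
+; pcmpgtq path (printed as [0,2147483648] per element on i686), and
+; 0x8000000080000000 (both dwords biased) for the SSE2 pcmpgtd emulation.
+; The identity is (a ^ SIGN) sgt (b ^ SIGN) iff a ugt b; the IR below spells
+; it out.
+define <2 x i64> @umax_via_signbit_flip(<2 x i64> %a, <2 x i64> %b) {
+  %sa = xor <2 x i64> %a, <i64 -9223372036854775808, i64 -9223372036854775808>
+  %sb = xor <2 x i64> %b, <i64 -9223372036854775808, i64 -9223372036854775808>
+  %c  = icmp sgt <2 x i64> %sa, %sb
+  %r  = select <2 x i1> %c, <2 x i64> %a, <2 x i64> %b
+  ret <2 x i64> %r
+}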
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm3, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X64-SSE2-NEXT:    por %xmm0, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm3
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm3, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovd %xmm0, %eax
+; X64-AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = icmp ugt <4 x i32> %a0, %1
+  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %5 = icmp ugt <4 x i32> %3, %4
+  %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+  %7 = extractelement <4 x i32> %6, i32 0
+  ret i32 %7
+}
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE2-NEXT:    pcmpgtw %xmm4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE2-NEXT:    pcmpgtw %xmm4, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm3, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X86-SSE2-NEXT:    por %xmm2, %xmm3
+; X86-SSE2-NEXT:    movd %xmm3, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X64-SSE2-NEXT:    pcmpgtw %xmm4, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm3
+; X64-SSE2-NEXT:    por %xmm0, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X64-SSE2-NEXT:    pcmpgtw %xmm4, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm3
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm3, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm1
+; X64-SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm2, %xmm3
+; X64-SSE2-NEXT:    movd %xmm3, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovd %xmm0, %eax
+; X64-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT:    retq
+  %1  = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ugt <8 x i16> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+  %4  = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ugt <8 x i16> %3, %4
+  %6  = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+  %7  = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ugt <8 x i16> %6, %7
+  %9  = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+  %10 = extractelement <8 x i16> %9, i32 0
+  ret i16 %10
+}
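+
+; NOTE (sketch): pmaxuw only arrived with SSE4.1, so the SSE2 blocks above
+; fall back to the same sign-bit bias (XOR with 32768) plus the signed
+; pcmpgtw. The psrld $16 is simply the cheapest way to move word 1 into
+; word 0 once only two live words remain; in IR that final step is the
+; usual pattern (hypothetical helper, mirroring the tested functions):
+define i16 @umax_v8i16_last_step(<8 x i16> %v) {
+  %s = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %c = icmp ugt <8 x i16> %v, %s
+  %m = select <8 x i1> %c, <8 x i16> %v, <8 x i16> %s
+  %r = extractelement <8 x i16> %m, i32 0
+  ret i16 %r
+}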
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT:    retq
+  %1  = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ugt <16 x i8> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+  %4  = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ugt <16 x i8> %3, %4
+  %6  = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+  %7  = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ugt <16 x i8> %6, %7
+  %9  = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+  %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ugt <16 x i8> %9, %10
+  %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+  %13 = extractelement <16 x i8> %12, i32 0
+  ret i8 %13
+}
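+
+; NOTE (illustration only): unsigned byte max is the one element type that
+; baseline SSE2 handles directly (pmaxub), so the SSE2 and SSE4.2 outputs
+; above differ only in the final extract (movd vs pextrb). The equivalent
+; experimental intrinsic form, with an assumed LangRef-style mangling, is:
+declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>)
+
+define i8 @test_reduce_v16i8_intrinsic(<16 x i8> %a0) {
+  %1 = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %a0)
+  ret i8 %1
+}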
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE42-NEXT:    pxor %xmm3, %xmm4
+; X86-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT:    movd %xmm2, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
+; X86-AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X86-AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT:    vpxor %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm3
+; X86-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm6, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm3, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm5, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm4, %xmm2
+; X64-SSE2-NEXT:    movq %xmm2, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm4
+; X64-SSE42-NEXT:    pxor %xmm3, %xmm4
+; X64-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT:    movq %xmm2, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
+; X64-AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; X64-AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT:    vxorpd %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT:    vpxor %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm3
+; X64-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = icmp ugt <4 x i64> %a0, %1
+  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %5 = icmp ugt <4 x i64> %3, %4
+  %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+  %7 = extractelement <4 x i64> %6, i32 0
+  ret i64 %7
+}
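+
+; NOTE: only AVX512VL has a native unsigned 64-bit max (vpmaxuq), which is
+; why the X64-AVX512 block shrinks to two vpmaxuq steps. AVX1 lacks 256-bit
+; integer compares entirely, so it splits the biased pcmpgtq into 128-bit
+; halves and reassembles the mask with vinsertf128 before the ymm vblendvpd.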
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm4, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X86-SSE2-NEXT:    por %xmm1, %xmm3
+; X86-SSE2-NEXT:    movd %xmm3, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm4, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm1, %xmm3
+; X64-SSE2-NEXT:    movd %xmm3, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ugt <8 x i32> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+  %4  = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ugt <8 x i32> %3, %4
+  %6  = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+  %7  = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ugt <8 x i32> %6, %7
+  %9  = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+  %10 = extractelement <8 x i32> %9, i32 0
+  ret i32 %10
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pcmpgtw %xmm3, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm4, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtw %xmm4, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X86-SSE2-NEXT:    por %xmm1, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtw %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm3, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    pcmpgtw %xmm3, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm4, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    pcmpgtw %xmm4, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm1, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtw %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm3
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm3, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ugt <16 x i16> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+  %4  = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ugt <16 x i16> %3, %4
+  %6  = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+  %7  = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ugt <16 x i16> %6, %7
+  %9  = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+  %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ugt <16 x i16> %9, %10
+  %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+  %13 = extractelement <16 x i16> %12, i32 0
+  ret i16 %13
+}
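(For reference: LLVM also carries experimental vector-reduction intrinsics that
express this entire computation as one call. A minimal sketch of the v16i16
case follows; the double-mangled intrinsic name is written per the current
LangRef convention, so treat the exact spelling as an assumption rather than
something verified against this revision.)

  declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>)

  define i16 @reduce_v16i16_intrinsic(<16 x i16> %a0) {
    ; one call in place of the log2 shufflevector/icmp/select ladder above
    %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> %a0)
    ret i16 %r
  }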
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ugt <32 x i8> %a0, %1
+  %3  = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+  %4  = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ugt <32 x i8> %3, %4
+  %6  = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+  %7  = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ugt <32 x i8> %6, %7
+  %9  = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+  %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ugt <32 x i8> %9, %10
+  %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+  %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp ugt <32 x i8> %12, %13
+  %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+  %16 = extractelement <32 x i8> %15, i32 0
+  ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    subl $28, %esp
+; X86-SSE2-NEXT:    .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm6
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm6, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X86-SSE2-NEXT:    movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm7, %xmm6
+; X86-SSE2-NEXT:    pand %xmm6, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X86-SSE2-NEXT:    por %xmm1, %xmm6
+; X86-SSE2-NEXT:    pand %xmm5, %xmm2
+; X86-SSE2-NEXT:    pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT:    por %xmm2, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm6
+; X86-SSE2-NEXT:    pandn %xmm5, %xmm0
+; X86-SSE2-NEXT:    por %xmm6, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; X86-SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    addl $28, %esp
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm6 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm6, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm5
+; X86-SSE42-NEXT:    pxor %xmm6, %xmm5
+; X86-SSE42-NEXT:    pcmpgtq %xmm0, %xmm5
+; X86-SSE42-NEXT:    movdqa %xmm2, %xmm7
+; X86-SSE42-NEXT:    pxor %xmm6, %xmm7
+; X86-SSE42-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm6, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm7, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
+; X86-SSE42-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT:    movapd %xmm3, %xmm1
+; X86-SSE42-NEXT:    xorpd %xmm6, %xmm1
+; X86-SSE42-NEXT:    movapd %xmm2, %xmm0
+; X86-SSE42-NEXT:    xorpd %xmm6, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm6, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm6
+; X86-SSE42-NEXT:    pcmpgtq %xmm6, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm5
+; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm2
+; X86-AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vxorpd %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
+; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm4
+; X86-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm3
+; X86-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm8, %xmm6
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm6, %xmm8
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X64-SSE2-NEXT:    movdqa %xmm7, %xmm5
+; X64-SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm9, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm7, %xmm6
+; X64-SSE2-NEXT:    pand %xmm6, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm6
+; X64-SSE2-NEXT:    por %xmm0, %xmm6
+; X64-SSE2-NEXT:    pand %xmm8, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm8
+; X64-SSE2-NEXT:    por %xmm1, %xmm8
+; X64-SSE2-NEXT:    movdqa %xmm8, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm6
+; X64-SSE2-NEXT:    pandn %xmm8, %xmm1
+; X64-SSE2-NEXT:    por %xmm6, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm4, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm5, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm1, %xmm3
+; X64-SSE2-NEXT:    movq %xmm3, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm6 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm6, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm5
+; X64-SSE42-NEXT:    pxor %xmm6, %xmm5
+; X64-SSE42-NEXT:    pcmpgtq %xmm0, %xmm5
+; X64-SSE42-NEXT:    movdqa %xmm2, %xmm7
+; X64-SSE42-NEXT:    pxor %xmm6, %xmm7
+; X64-SSE42-NEXT:    movdqa %xmm4, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm6, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm7, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
+; X64-SSE42-NEXT:    movdqa %xmm5, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT:    movapd %xmm3, %xmm1
+; X64-SSE42-NEXT:    xorpd %xmm6, %xmm1
+; X64-SSE42-NEXT:    movapd %xmm2, %xmm0
+; X64-SSE42-NEXT:    xorpd %xmm6, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm6, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm6
+; X64-SSE42-NEXT:    pcmpgtq %xmm6, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT:    movq %xmm1, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm5
+; X64-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm2
+; X64-AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vxorpd %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT:    vpcmpgtq %xmm0, %xmm3, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
+; X64-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm4
+; X64-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm3
+; X64-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT:    vpcmpgtq %ymm2, %ymm3, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ugt <8 x i64> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+  %4  = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ugt <8 x i64> %3, %4
+  %6  = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+  %7  = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ugt <8 x i64> %6, %7
+  %9  = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+  %10 = extractelement <8 x i64> %9, i32 0
+  ret i64 %10
+}
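(The SSE2/SSE42/AVX1/AVX2 blocks above all bias both operands with the
sign-bit constant -- the 0x8000000000000000 pxor -- before comparing, because
pre-AVX512 x86 has no unsigned 64-bit vector compare; SSE2 further lacks any
64-bit signed compare, hence its extra pcmpgtd/pcmpeqd/pshufd dance, while
AVX512 can use vpmaxuq directly. A minimal sketch of the identity being
exploited, written as straight IR:)

  define <2 x i1> @ugt_via_sgt(<2 x i64> %a, <2 x i64> %b) {
    ; flipping the sign bit turns an unsigned compare into a signed one:
    ; (a ^ 0x80...0) sgt (b ^ 0x80...0)  <==>  a ugt b
    %ax = xor <2 x i64> %a, <i64 -9223372036854775808, i64 -9223372036854775808>
    %bx = xor <2 x i64> %b, <i64 -9223372036854775808, i64 -9223372036854775808>
    %c  = icmp sgt <2 x i64> %ax, %bx
    ret <2 x i1> %c
  }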
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT:    pand %xmm7, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm7
+; X86-SSE2-NEXT:    por %xmm0, %xmm7
+; X86-SSE2-NEXT:    pand %xmm6, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X86-SSE2-NEXT:    por %xmm1, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm7
+; X86-SSE2-NEXT:    pandn %xmm6, %xmm1
+; X86-SSE2-NEXT:    por %xmm7, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxud %xmm3, %xmm1
+; X86-SSE42-NEXT:    pmaxud %xmm2, %xmm0
+; X86-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxud %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxud %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X64-SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X64-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT:    pand %xmm7, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm7
+; X64-SSE2-NEXT:    por %xmm0, %xmm7
+; X64-SSE2-NEXT:    pand %xmm6, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X64-SSE2-NEXT:    por %xmm1, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm6, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm7
+; X64-SSE2-NEXT:    pandn %xmm6, %xmm1
+; X64-SSE2-NEXT:    por %xmm7, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm4, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxud %xmm3, %xmm1
+; X64-SSE42-NEXT:    pmaxud %xmm2, %xmm0
+; X64-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxud %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxud %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpmaxud %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxud %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ugt <16 x i32> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+  %4  = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ugt <16 x i32> %3, %4
+  %6  = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+  %7  = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ugt <16 x i32> %6, %7
+  %9  = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+  %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ugt <16 x i32> %9, %10
+  %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+  %13 = extractelement <16 x i32> %12, i32 0
+  ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    pcmpgtw %xmm5, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X86-SSE2-NEXT:    pcmpgtw %xmm5, %xmm7
+; X86-SSE2-NEXT:    pand %xmm7, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm7
+; X86-SSE2-NEXT:    por %xmm0, %xmm7
+; X86-SSE2-NEXT:    pand %xmm6, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X86-SSE2-NEXT:    por %xmm1, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm7
+; X86-SSE2-NEXT:    pandn %xmm6, %xmm1
+; X86-SSE2-NEXT:    por %xmm7, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X86-SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X86-SSE2-NEXT:    pcmpgtw %xmm3, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X86-SSE2-NEXT:    pcmpgtw %xmm4, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxuw %xmm3, %xmm1
+; X86-SSE42-NEXT:    pmaxuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm5
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X64-SSE2-NEXT:    pcmpgtw %xmm5, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm7
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X64-SSE2-NEXT:    pcmpgtw %xmm5, %xmm7
+; X64-SSE2-NEXT:    pand %xmm7, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm7
+; X64-SSE2-NEXT:    por %xmm0, %xmm7
+; X64-SSE2-NEXT:    pand %xmm6, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X64-SSE2-NEXT:    por %xmm1, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm6, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm7
+; X64-SSE2-NEXT:    pandn %xmm6, %xmm1
+; X64-SSE2-NEXT:    por %xmm7, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X64-SSE2-NEXT:    pcmpgtw %xmm3, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X64-SSE2-NEXT:    pcmpgtw %xmm3, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X64-SSE2-NEXT:    pcmpgtw %xmm4, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    movd %xmm2, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxuw %xmm3, %xmm1
+; X64-SSE42-NEXT:    pmaxuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pmaxuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpmaxuw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxuw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ugt <32 x i16> %a0, %1
+  %3  = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+  %4  = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ugt <32 x i16> %3, %4
+  %6  = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+  %7  = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ugt <32 x i16> %6, %7
+  %9  = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+  %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ugt <32 x i16> %9, %10
+  %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+  %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp ugt <32 x i16> %12, %13
+  %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+  %16 = extractelement <32 x i16> %15, i32 0
+  ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pmaxub %xmm3, %xmm1
+; X86-SSE2-NEXT:    pmaxub %xmm2, %xmm0
+; X86-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pmaxub %xmm3, %xmm1
+; X86-SSE42-NEXT:    pmaxub %xmm2, %xmm0
+; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pmaxub %xmm3, %xmm1
+; X64-SSE2-NEXT:    pmaxub %xmm2, %xmm0
+; X64-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pmaxub %xmm3, %xmm1
+; X64-SSE42-NEXT:    pmaxub %xmm2, %xmm0
+; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ugt <64 x i8> %a0, %1
+  %3  = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+  %4  = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ugt <64 x i8> %3, %4
+  %6  = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+  %7  = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ugt <64 x i8> %6, %7
+  %9  = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+  %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ugt <64 x i8> %9, %10
+  %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+  %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp ugt <64 x i8> %12, %13
+  %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+  %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %17 = icmp ugt <64 x i8> %15, %16
+  %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+  %19 = extractelement <64 x i8> %18, i32 0
+  ret i8 %19
+}
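+
+; Each function above hand-expands the reduction as a log2(n) shuffle
+; ladder: swap the upper half down, compare, select, and repeat until
+; element 0 holds the result. Once the experimental reduction intrinsics
+; mentioned in the commit log are wired up, the whole ladder should
+; collapse to one call; the mangling here is only a sketch and may not
+; match the final scheme:
+;   declare i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8>)
+;   %res = call i8 @llvm.experimental.vector.reduce.umax.i8.v64i8(<64 x i8> %a0)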

Added: llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll?rev=317439&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll (added)
+++ llvm/trunk/test/CodeGen/X86/horizontal-reduce-umin.ll Sun Nov  5 11:48:24 2017
@@ -0,0 +1,2207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2     | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse4.2   | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE --check-prefix=X86-SSE42
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx      | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2     | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2   | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE --check-prefix=X64-SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx    | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2   | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX512
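+; Each RUN line passes several prefixes so update_llc_test_checks.py can
+; share check blocks: X86 covers every 32-bit run, X86-SSE both SSE
+; configurations, and X86-SSE2/X86-SSE42 a single configuration each
+; (likewise for the X64 prefixes).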
+
+;
+; 128-bit Vectors
+;
+
+define i64 @test_reduce_v2i64(<2 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v2i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X86-SSE2-NEXT:    por %xmm0, %xmm3
+; X86-SSE2-NEXT:    movd %xmm3, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v2i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm3
+; X86-SSE42-NEXT:    pxor %xmm0, %xmm3
+; X86-SSE42-NEXT:    pxor %xmm2, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT:    movd %xmm2, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v2i64:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; X86-AVX-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; X86-AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v2i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm5, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm3
+; X64-SSE2-NEXT:    por %xmm0, %xmm3
+; X64-SSE2-NEXT:    movq %xmm3, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v2i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm3
+; X64-SSE42-NEXT:    pxor %xmm0, %xmm3
+; X64-SSE42-NEXT:    pxor %xmm2, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT:    movq %xmm2, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v2i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v2i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX2-NEXT:    vpxor %xmm2, %xmm1, %xmm2
+; X64-AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v2i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminuq %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    retq
+  %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %2 = icmp ult <2 x i64> %a0, %1
+  %3 = select <2 x i1> %2, <2 x i64> %a0, <2 x i64> %1
+  %4 = extractelement <2 x i64> %3, i32 0
+  ret i64 %4
+}
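+
+; SSE has no unsigned 64-bit compare, so both operands are XORed with a
+; sign-bit pattern first. SSE2 flips every dword (0x80000000) so the signed
+; pcmpgtd acts as an unsigned dword compare, then assembles each qword
+; result as "high greater OR (high equal AND low greater)"; SSE4.2 only
+; needs the qword sign flip (0x8000000000000000) before pcmpgtq, and
+; AVX512VL reduces the whole sequence to a single vpminuq. The i686 runs
+; also return the i64 in EAX:EDX, hence the trailing movd/pextrd pair.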
+
+define i32 @test_reduce_v4i32(<4 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminud %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v4i32:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm4, %xmm2
+; X64-SSE2-NEXT:    movd %xmm2, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminud %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v4i32:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovd %xmm0, %eax
+; X64-AVX-NEXT:    retq
+  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = icmp ult <4 x i32> %a0, %1
+  %3 = select <4 x i1> %2, <4 x i32> %a0, <4 x i32> %1
+  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %5 = icmp ult <4 x i32> %3, %4
+  %6 = select <4 x i1> %5, <4 x i32> %3, <4 x i32> %4
+  %7 = extractelement <4 x i32> %6, i32 0
+  ret i32 %7
+}
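+
+; pminud only arrived with SSE4.1, so the SSE2 path synthesizes the
+; unsigned compare (sign-bit flip + pcmpgtd) and blends with the classic
+; pand/pandn/por idiom; every newer target needs just two shuffles and two
+; pminud.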
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm3
+; X86-SSE2-NEXT:    pcmpgtw %xmm2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X86-SSE2-NEXT:    por %xmm4, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm1
+; X86-SSE2-NEXT:    pcmpgtw %xmm2, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X86-SSE2-NEXT:    por %xmm3, %xmm1
+; X86-SSE2-NEXT:    movd %xmm1, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v8i16:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovd %xmm0, %eax
+; X86-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X64-SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm3
+; X64-SSE2-NEXT:    pcmpgtw %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm4, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm1, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm1
+; X64-SSE2-NEXT:    pcmpgtw %xmm2, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm3
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm1
+; X64-SSE2-NEXT:    por %xmm3, %xmm1
+; X64-SSE2-NEXT:    movd %xmm1, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v8i16:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovd %xmm0, %eax
+; X64-AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX-NEXT:    retq
+  %1  = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ult <8 x i16> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i16> %a0, <8 x i16> %1
+  %4  = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ult <8 x i16> %3, %4
+  %6  = select <8 x i1> %5, <8 x i16> %3, <8 x i16> %4
+  %7  = shufflevector <8 x i16> %6, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ult <8 x i16> %6, %7
+  %9  = select <8 x i1> %8, <8 x i16> %6, <8 x i16> %7
+  %10 = extractelement <8 x i16> %9, i32 0
+  ret i16 %10
+}
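+
+; SSE2 only has the signed pminsw (pminuw is SSE4.1), so word reductions
+; fall back to the same flip-and-pcmpgtw blend. The last step shifts lane 1
+; down with psrld $16 instead of shuffling, since only the low word is
+; extracted.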
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX-LABEL: test_reduce_v16i8:
+; X86-AVX:       ## BB#0:
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX-LABEL: test_reduce_v16i8:
+; X64-AVX:       ## BB#0:
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX-NEXT:    retq
+  %1  = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ult <16 x i8> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i8> %a0, <16 x i8> %1
+  %4  = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ult <16 x i8> %3, %4
+  %6  = select <16 x i1> %5, <16 x i8> %3, <16 x i8> %4
+  %7  = shufflevector <16 x i8> %6, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ult <16 x i8> %6, %7
+  %9  = select <16 x i1> %8, <16 x i8> %6, <16 x i8> %7
+  %10 = shufflevector <16 x i8> %9, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ult <16 x i8> %9, %10
+  %12 = select <16 x i1> %11, <16 x i8> %9, <16 x i8> %10
+  %13 = extractelement <16 x i8> %12, i32 0
+  ret i8 %13
+}
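+
+; pminub has existed since SSE2, so every target runs the same
+; shuffle/pminub ladder here; only the scalar extract differs (movd with an
+; implicit truncate to AL on SSE2 versus pextrb on SSE4.1 and later).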
+
+;
+; 256-bit Vectors
+;
+
+define i64 @test_reduce_v4i64(<4 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v4i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v4i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT:    movdqa %xmm2, %xmm4
+; X86-SSE42-NEXT:    pxor %xmm3, %xmm4
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm4, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE42-NEXT:    pcmpgtq %xmm0, %xmm3
+; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X86-SSE42-NEXT:    movd %xmm2, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm2, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v4i64:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm4
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; X86-AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
+; X86-AVX1-NEXT:    vxorpd %xmm2, %xmm1, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT:    vpxor %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v4i64:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v4i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm6, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm3, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm5, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm4, %xmm2
+; X64-SSE2-NEXT:    movq %xmm2, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v4i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT:    movdqa %xmm2, %xmm4
+; X64-SSE42-NEXT:    pxor %xmm3, %xmm4
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm4, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm3, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE42-NEXT:    pcmpgtq %xmm0, %xmm3
+; X64-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
+; X64-SSE42-NEXT:    movq %xmm2, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v4i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm4
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; X64-AVX1-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vxorpd %xmm2, %xmm0, %xmm3
+; X64-AVX1-NEXT:    vxorpd %xmm2, %xmm1, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT:    vpxor %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v4i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v4i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminuq %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %2 = icmp ult <4 x i64> %a0, %1
+  %3 = select <4 x i1> %2, <4 x i64> %a0, <4 x i64> %1
+  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %5 = icmp ult <4 x i64> %3, %4
+  %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %4
+  %7 = extractelement <4 x i64> %6, i32 0
+  ret i64 %7
+}
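+
+; 256-bit inputs are first halved to 128 bits. AVX1 has no 256-bit integer
+; compares, so it compares the xmm halves separately and reassembles the
+; mask with vinsertf128; AVX2 sign-flips the full ymm and compares with one
+; vpcmpgtq; AVX512VL shrinks each step to a single vpminuq.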
+
+define i32 @test_reduce_v8i32(<8 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X86-SSE2-NEXT:    por %xmm4, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm3, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminud %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i32:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i32:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm4, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm3
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm3, %xmm2
+; X64-SSE2-NEXT:    movd %xmm2, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminud %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i32:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i32:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i32:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ult <8 x i32> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i32> %a0, <8 x i32> %1
+  %4  = shufflevector <8 x i32> %3, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ult <8 x i32> %3, %4
+  %6  = select <8 x i1> %5, <8 x i32> %3, <8 x i32> %4
+  %7  = shufflevector <8 x i32> %6, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ult <8 x i32> %6, %7
+  %9  = select <8 x i1> %8, <8 x i32> %6, <8 x i32> %7
+  %10 = extractelement <8 x i32> %9, i32 0
+  ret i32 %10
+}
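+
+; Once vextracti128 has pulled the high half down, the AVX2 and AVX512
+; paths keep issuing full-width ymm vpminud even though only the low xmm
+; lane still matters; the upper-lane work is dead but harmless.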
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X86-SSE2-NEXT:    por %xmm0, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X86-SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X86-SSE2-NEXT:    por %xmm4, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X86-SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm4
+; X86-SSE2-NEXT:    por %xmm3, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpgtw %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i16:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i16:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    pcmpgtw %xmm3, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm1, %xmm4
+; X64-SSE2-NEXT:    por %xmm0, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm3
+; X64-SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm4, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm4
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm4
+; X64-SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm3
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm4
+; X64-SSE2-NEXT:    por %xmm3, %xmm4
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm2
+; X64-SSE2-NEXT:    pcmpgtw %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm4
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm4, %xmm2
+; X64-SSE2-NEXT:    movd %xmm2, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i16:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i16:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i16:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ult <16 x i16> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i16> %a0, <16 x i16> %1
+  %4  = shufflevector <16 x i16> %3, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ult <16 x i16> %3, %4
+  %6  = select <16 x i1> %5, <16 x i16> %3, <16 x i16> %4
+  %7  = shufflevector <16 x i16> %6, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ult <16 x i16> %6, %7
+  %9  = select <16 x i1> %8, <16 x i16> %6, <16 x i16> %7
+  %10 = shufflevector <16 x i16> %9, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ult <16 x i16> %9, %10
+  %12 = select <16 x i1> %11, <16 x i16> %9, <16 x i16> %10
+  %13 = extractelement <16 x i16> %12, i32 0
+  ret i16 %13
+}
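+
+; The same ladder as test_reduce_v8i16 with one extra 128-bit halving up
+; front; pre-SSE4.1 targets again pay the flip/compare/blend cost at every
+; step for the missing pminuw.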
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i8:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i8:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i8:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i8:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i8:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ult <32 x i8> %a0, %1
+  %3  = select <32 x i1> %2, <32 x i8> %a0, <32 x i8> %1
+  %4  = shufflevector <32 x i8> %3, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ult <32 x i8> %3, %4
+  %6  = select <32 x i1> %5, <32 x i8> %3, <32 x i8> %4
+  %7  = shufflevector <32 x i8> %6, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ult <32 x i8> %6, %7
+  %9  = select <32 x i1> %8, <32 x i8> %6, <32 x i8> %7
+  %10 = shufflevector <32 x i8> %9, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ult <32 x i8> %9, %10
+  %12 = select <32 x i1> %11, <32 x i8> %9, <32 x i8> %10
+  %13 = shufflevector <32 x i8> %12, <32 x i8> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp ult <32 x i8> %12, %13
+  %15 = select <32 x i1> %14, <32 x i8> %12, <32 x i8> %13
+  %16 = extractelement <32 x i8> %15, i32 0
+  ret i8 %16
+}
+
+;
+; 512-bit Vectors
+;
+
+define i64 @test_reduce_v8i64(<8 x i64> %a0) {
+; X86-SSE2-LABEL: test_reduce_v8i64:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    subl $28, %esp
+; X86-SSE2-NEXT:    .cfi_def_cfa_offset 32
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, (%esp) ## 16-byte Spill
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; X86-SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm5, %xmm6
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm6, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X86-SSE2-NEXT:    movdqa %xmm7, %xmm0
+; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm0
+; X86-SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; X86-SSE2-NEXT:    pand %xmm6, %xmm7
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm7, %xmm6
+; X86-SSE2-NEXT:    pand %xmm6, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X86-SSE2-NEXT:    por %xmm1, %xmm6
+; X86-SSE2-NEXT:    pand %xmm5, %xmm2
+; X86-SSE2-NEXT:    pandn (%esp), %xmm5 ## 16-byte Folded Reload
+; X86-SSE2-NEXT:    por %xmm2, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm6, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm1, %xmm0
+; X86-SSE2-NEXT:    pand %xmm0, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm6, %xmm0
+; X86-SSE2-NEXT:    por %xmm5, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE2-NEXT:    movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT:    pcmpeqd %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; X86-SSE2-NEXT:    pand %xmm2, %xmm4
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
+; X86-SSE2-NEXT:    por %xmm4, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
+; X86-SSE2-NEXT:    por %xmm0, %xmm2
+; X86-SSE2-NEXT:    movd %xmm2, %eax
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %edx
+; X86-SSE2-NEXT:    addl $28, %esp
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v8i64:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm5
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648]
+; X86-SSE42-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm2, %xmm6
+; X86-SSE42-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE42-NEXT:    pcmpgtq %xmm0, %xmm6
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm7
+; X86-SSE42-NEXT:    pxor %xmm4, %xmm7
+; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm7, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; X86-SSE42-NEXT:    movdqa %xmm6, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
+; X86-SSE42-NEXT:    movapd %xmm2, %xmm1
+; X86-SSE42-NEXT:    xorpd %xmm4, %xmm1
+; X86-SSE42-NEXT:    movapd %xmm3, %xmm0
+; X86-SSE42-NEXT:    xorpd %xmm4, %xmm0
+; X86-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X86-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm4
+; X86-SSE42-NEXT:    pcmpgtq %xmm0, %xmm4
+; X86-SSE42-NEXT:    movdqa %xmm4, %xmm0
+; X86-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    pextrd $1, %xmm1, %edx
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v8i64:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648]
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm4
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm5
+; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm4
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vxorpd %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X86-AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm3
+; X86-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm3
+; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X86-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v8i64:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648]
+; X86-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm4
+; X86-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; X86-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X86-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; X86-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %edx
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v8i64:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm5, %xmm6
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm8, %xmm6
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm6, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm7
+; X64-SSE2-NEXT:    movdqa %xmm7, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm8, %xmm7
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm7, %xmm6
+; X64-SSE2-NEXT:    pand %xmm6, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm6
+; X64-SSE2-NEXT:    por %xmm1, %xmm6
+; X64-SSE2-NEXT:    pand %xmm5, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X64-SSE2-NEXT:    por %xmm0, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm6, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm3, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm5
+; X64-SSE2-NEXT:    pandn %xmm6, %xmm1
+; X64-SSE2-NEXT:    por %xmm5, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm9, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm9
+; X64-SSE2-NEXT:    movdqa %xmm9, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; X64-SSE2-NEXT:    pcmpeqd %xmm2, %xmm9
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3]
+; X64-SSE2-NEXT:    pand %xmm4, %xmm2
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; X64-SSE2-NEXT:    por %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm1, %xmm3
+; X64-SSE2-NEXT:    movq %xmm3, %rax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v8i64:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm5
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; X64-SSE42-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm2, %xmm6
+; X64-SSE42-NEXT:    pxor %xmm4, %xmm6
+; X64-SSE42-NEXT:    pcmpgtq %xmm0, %xmm6
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm7
+; X64-SSE42-NEXT:    pxor %xmm4, %xmm7
+; X64-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm7, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
+; X64-SSE42-NEXT:    movdqa %xmm6, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm5, %xmm2
+; X64-SSE42-NEXT:    movapd %xmm2, %xmm1
+; X64-SSE42-NEXT:    xorpd %xmm4, %xmm1
+; X64-SSE42-NEXT:    movapd %xmm3, %xmm0
+; X64-SSE42-NEXT:    xorpd %xmm4, %xmm0
+; X64-SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm3
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; X64-SSE42-NEXT:    movdqa %xmm3, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm4
+; X64-SSE42-NEXT:    pcmpgtq %xmm0, %xmm4
+; X64-SSE42-NEXT:    movdqa %xmm4, %xmm0
+; X64-SSE42-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
+; X64-SSE42-NEXT:    movq %xmm1, %rax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v8i64:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm4
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm5
+; X64-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm5, %xmm4
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm4
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vxorpd %xmm3, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vxorpd %xmm3, %xmm1, %xmm4
+; X64-AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
+; X64-AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm3
+; X64-AVX1-NEXT:    vpcmpgtq %xmm3, %xmm0, %xmm3
+; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; X64-AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX1-NEXT:    vmovq %xmm0, %rax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v8i64:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; X64-AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm4
+; X64-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm4, %ymm3
+; X64-AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm0, %ymm3
+; X64-AVX2-NEXT:    vxorpd %ymm2, %ymm1, %ymm2
+; X64-AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT:    vmovq %xmm0, %rax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v8i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovq %xmm0, %rax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ult <8 x i64> %a0, %1
+  %3  = select <8 x i1> %2, <8 x i64> %a0, <8 x i64> %1
+  %4  = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ult <8 x i64> %3, %4
+  %6  = select <8 x i1> %5, <8 x i64> %3, <8 x i64> %4
+  %7  = shufflevector <8 x i64> %6, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ult <8 x i64> %6, %7
+  %9  = select <8 x i1> %8, <8 x i64> %6, <8 x i64> %7
+  %10 = extractelement <8 x i64> %9, i32 0
+  ret i64 %10
+}
+
+define i32 @test_reduce_v16i32(<16 x i32> %a0) {
+; X86-SSE2-LABEL: test_reduce_v16i32:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X86-SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
+; X86-SSE2-NEXT:    pand %xmm7, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm7
+; X86-SSE2-NEXT:    por %xmm1, %xmm7
+; X86-SSE2-NEXT:    pand %xmm5, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X86-SSE2-NEXT:    por %xmm0, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm7, %xmm1
+; X86-SSE2-NEXT:    por %xmm5, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X86-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X86-SSE2-NEXT:    por %xmm1, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X86-SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm4
+; X86-SSE2-NEXT:    por %xmm3, %xmm4
+; X86-SSE2-NEXT:    movd %xmm4, %eax
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v16i32:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminud %xmm3, %xmm1
+; X86-SSE42-NEXT:    pminud %xmm2, %xmm0
+; X86-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminud %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X86-SSE42-NEXT:    movd %xmm0, %eax
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v16i32:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v16i32:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v16i32:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X64-SSE2-NEXT:    pcmpgtd %xmm6, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X64-SSE2-NEXT:    pcmpgtd %xmm6, %xmm7
+; X64-SSE2-NEXT:    pand %xmm7, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm7
+; X64-SSE2-NEXT:    por %xmm1, %xmm7
+; X64-SSE2-NEXT:    pand %xmm5, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X64-SSE2-NEXT:    por %xmm0, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pcmpgtd %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm5
+; X64-SSE2-NEXT:    pandn %xmm7, %xmm1
+; X64-SSE2-NEXT:    por %xmm5, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X64-SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm1, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X64-SSE2-NEXT:    pcmpgtd %xmm1, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm3
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm4
+; X64-SSE2-NEXT:    por %xmm3, %xmm4
+; X64-SSE2-NEXT:    movd %xmm4, %eax
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v16i32:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminud %xmm3, %xmm1
+; X64-SSE42-NEXT:    pminud %xmm2, %xmm0
+; X64-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminud %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminud %xmm1, %xmm0
+; X64-SSE42-NEXT:    movd %xmm0, %eax
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v16i32:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpminud %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminud %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminud %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v16i32:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminud %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v16i32:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminud %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ult <16 x i32> %a0, %1
+  %3  = select <16 x i1> %2, <16 x i32> %a0, <16 x i32> %1
+  %4  = shufflevector <16 x i32> %3, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ult <16 x i32> %3, %4
+  %6  = select <16 x i1> %5, <16 x i32> %3, <16 x i32> %4
+  %7  = shufflevector <16 x i32> %6, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ult <16 x i32> %6, %7
+  %9  = select <16 x i1> %8, <16 x i32> %6, <16 x i32> %7
+  %10 = shufflevector <16 x i32> %9, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ult <16 x i32> %9, %10
+  %12 = select <16 x i1> %11, <16 x i32> %9, <16 x i32> %10
+  %13 = extractelement <16 x i32> %12, i32 0
+  ret i32 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; X86-SSE2-LABEL: test_reduce_v32i16:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X86-SSE2-NEXT:    pcmpgtw %xmm6, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X86-SSE2-NEXT:    pcmpgtw %xmm6, %xmm7
+; X86-SSE2-NEXT:    pand %xmm7, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm3, %xmm7
+; X86-SSE2-NEXT:    por %xmm1, %xmm7
+; X86-SSE2-NEXT:    pand %xmm5, %xmm0
+; X86-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X86-SSE2-NEXT:    por %xmm0, %xmm5
+; X86-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm7, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
+; X86-SSE2-NEXT:    pand %xmm1, %xmm5
+; X86-SSE2-NEXT:    pandn %xmm7, %xmm1
+; X86-SSE2-NEXT:    por %xmm5, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X86-SSE2-NEXT:    pcmpgtw %xmm2, %xmm3
+; X86-SSE2-NEXT:    pand %xmm3, %xmm1
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X86-SSE2-NEXT:    por %xmm1, %xmm3
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X86-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X86-SSE2-NEXT:    pcmpgtw %xmm1, %xmm2
+; X86-SSE2-NEXT:    pand %xmm2, %xmm3
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X86-SSE2-NEXT:    por %xmm3, %xmm2
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT:    psrld $16, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X86-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X86-SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
+; X86-SSE2-NEXT:    pand %xmm4, %xmm2
+; X86-SSE2-NEXT:    pandn %xmm0, %xmm4
+; X86-SSE2-NEXT:    por %xmm2, %xmm4
+; X86-SSE2-NEXT:    movd %xmm4, %eax
+; X86-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v32i16:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminuw %xmm3, %xmm1
+; X86-SSE42-NEXT:    pminuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X86-SSE42-NEXT:    movd %xmm1, %eax
+; X86-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v32i16:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminuw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovd %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v32i16:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vmovd %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v32i16:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm5
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm5
+; X64-SSE2-NEXT:    pcmpgtw %xmm6, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm6
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm7
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm7
+; X64-SSE2-NEXT:    pcmpgtw %xmm6, %xmm7
+; X64-SSE2-NEXT:    pand %xmm7, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm3, %xmm7
+; X64-SSE2-NEXT:    por %xmm1, %xmm7
+; X64-SSE2-NEXT:    pand %xmm5, %xmm0
+; X64-SSE2-NEXT:    pandn %xmm2, %xmm5
+; X64-SSE2-NEXT:    por %xmm0, %xmm5
+; X64-SSE2-NEXT:    movdqa %xmm5, %xmm0
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm7, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pcmpgtw %xmm0, %xmm1
+; X64-SSE2-NEXT:    pand %xmm1, %xmm5
+; X64-SSE2-NEXT:    pandn %xmm7, %xmm1
+; X64-SSE2-NEXT:    por %xmm5, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm3
+; X64-SSE2-NEXT:    pcmpgtw %xmm2, %xmm3
+; X64-SSE2-NEXT:    pand %xmm3, %xmm1
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm3
+; X64-SSE2-NEXT:    por %xmm1, %xmm3
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; X64-SSE2-NEXT:    movdqa %xmm3, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm2
+; X64-SSE2-NEXT:    pcmpgtw %xmm1, %xmm2
+; X64-SSE2-NEXT:    pand %xmm2, %xmm3
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm2
+; X64-SSE2-NEXT:    por %xmm3, %xmm2
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm0
+; X64-SSE2-NEXT:    psrld $16, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm2, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm4, %xmm1
+; X64-SSE2-NEXT:    pxor %xmm0, %xmm4
+; X64-SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
+; X64-SSE2-NEXT:    pand %xmm4, %xmm2
+; X64-SSE2-NEXT:    pandn %xmm0, %xmm4
+; X64-SSE2-NEXT:    por %xmm2, %xmm4
+; X64-SSE2-NEXT:    movd %xmm4, %eax
+; X64-SSE2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v32i16:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminuw %xmm3, %xmm1
+; X64-SSE42-NEXT:    pminuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminuw %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pminuw %xmm0, %xmm1
+; X64-SSE42-NEXT:    movd %xmm1, %eax
+; X64-SSE42-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v32i16:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpminuw %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminuw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovd %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v32i16:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vmovd %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v32i16:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminuw %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovd %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ult <32 x i16> %a0, %1
+  %3  = select <32 x i1> %2, <32 x i16> %a0, <32 x i16> %1
+  %4  = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ult <32 x i16> %3, %4
+  %6  = select <32 x i1> %5, <32 x i16> %3, <32 x i16> %4
+  %7  = shufflevector <32 x i16> %6, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ult <32 x i16> %6, %7
+  %9  = select <32 x i1> %8, <32 x i16> %6, <32 x i16> %7
+  %10 = shufflevector <32 x i16> %9, <32 x i16> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ult <32 x i16> %9, %10
+  %12 = select <32 x i1> %11, <32 x i16> %9, <32 x i16> %10
+  %13 = shufflevector <32 x i16> %12, <32 x i16> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp ult <32 x i16> %12, %13
+  %15 = select <32 x i1> %14, <32 x i16> %12, <32 x i16> %13
+  %16 = extractelement <32 x i16> %15, i32 0
+  ret i16 %16
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; X86-SSE2-LABEL: test_reduce_v64i8:
+; X86-SSE2:       ## BB#0:
+; X86-SSE2-NEXT:    pminub %xmm3, %xmm1
+; X86-SSE2-NEXT:    pminub %xmm2, %xmm0
+; X86-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT:    psrld $16, %xmm1
+; X86-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT:    psrlw $8, %xmm0
+; X86-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE2-NEXT:    movd %xmm0, %eax
+; X86-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE2-NEXT:    retl
+;
+; X86-SSE42-LABEL: test_reduce_v64i8:
+; X86-SSE42:       ## BB#0:
+; X86-SSE42-NEXT:    pminub %xmm3, %xmm1
+; X86-SSE42-NEXT:    pminub %xmm2, %xmm0
+; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X86-SSE42-NEXT:    psrld $16, %xmm1
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X86-SSE42-NEXT:    psrlw $8, %xmm0
+; X86-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X86-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-SSE42-NEXT:    retl
+;
+; X86-AVX1-LABEL: test_reduce_v64i8:
+; X86-AVX1:       ## BB#0:
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X86-AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm2
+; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-AVX2-LABEL: test_reduce_v64i8:
+; X86-AVX2:       ## BB#0:
+; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X86-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X86-AVX2-NEXT:    vzeroupper
+; X86-AVX2-NEXT:    retl
+;
+; X64-SSE2-LABEL: test_reduce_v64i8:
+; X64-SSE2:       ## BB#0:
+; X64-SSE2-NEXT:    pminub %xmm3, %xmm1
+; X64-SSE2-NEXT:    pminub %xmm2, %xmm0
+; X64-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE2-NEXT:    psrld $16, %xmm1
+; X64-SSE2-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE2-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE2-NEXT:    psrlw $8, %xmm0
+; X64-SSE2-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE2-NEXT:    movd %xmm0, %eax
+; X64-SSE2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE2-NEXT:    retq
+;
+; X64-SSE42-LABEL: test_reduce_v64i8:
+; X64-SSE42:       ## BB#0:
+; X64-SSE42-NEXT:    pminub %xmm3, %xmm1
+; X64-SSE42-NEXT:    pminub %xmm2, %xmm0
+; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
+; X64-SSE42-NEXT:    psrld $16, %xmm1
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm1
+; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
+; X64-SSE42-NEXT:    psrlw $8, %xmm0
+; X64-SSE42-NEXT:    pminub %xmm1, %xmm0
+; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; X64-SSE42-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-SSE42-NEXT:    retq
+;
+; X64-AVX1-LABEL: test_reduce_v64i8:
+; X64-AVX1:       ## BB#0:
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; X64-AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm2
+; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX1-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_reduce_v64i8:
+; X64-AVX2:       ## BB#0:
+; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX2-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: test_reduce_v64i8:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
+; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; X64-AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; X64-AVX512-NEXT:    vzeroupper
+; X64-AVX512-NEXT:    retq
+  %1  = shufflevector <64 x i8> %a0, <64 x i8> undef, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %2  = icmp ult <64 x i8> %a0, %1
+  %3  = select <64 x i1> %2, <64 x i8> %a0, <64 x i8> %1
+  %4  = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5  = icmp ult <64 x i8> %3, %4
+  %6  = select <64 x i1> %5, <64 x i8> %3, <64 x i8> %4
+  %7  = shufflevector <64 x i8> %6, <64 x i8> undef, <64 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8  = icmp ult <64 x i8> %6, %7
+  %9  = select <64 x i1> %8, <64 x i8> %6, <64 x i8> %7
+  %10 = shufflevector <64 x i8> %9, <64 x i8> undef, <64 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %11 = icmp ult <64 x i8> %9, %10
+  %12 = select <64 x i1> %11, <64 x i8> %9, <64 x i8> %10
+  %13 = shufflevector <64 x i8> %12, <64 x i8> undef, <64 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %14 = icmp ult <64 x i8> %12, %13
+  %15 = select <64 x i1> %14, <64 x i8> %12, <64 x i8> %13
+  %16 = shufflevector <64 x i8> %15, <64 x i8> undef, <64 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %17 = icmp ult <64 x i8> %15, %16
+  %18 = select <64 x i1> %17, <64 x i8> %15, <64 x i8> %16
+  %19 = extractelement <64 x i8> %18, i32 0
+  ret i8 %19
+}
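
A note for readers skimming the checks: the IR at the end of each test encodes the same log2-step ladder -- shuffle the upper half of the vector down, compare with icmp ult, select the smaller lanes -- until the minimum sits in lane 0. A minimal C sketch of that ladder for a single <4 x i32>, assuming SSE4.1 (pminud is exposed as _mm_min_epu32); this is illustrative only, not part of the commit:

#include <smmintrin.h> /* SSE4.1 */

/* Reduce a v4i32 to its unsigned minimum with the same shuffle ladder
   the tests above expect: swap 64-bit halves, min, then swap 32-bit
   lanes and min again, leaving the result in lane 0. */
static unsigned reduce_umin_v4i32(__m128i v) {
    __m128i t = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)); /* [2,3,0,1] */
    v = _mm_min_epu32(v, t);
    t = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 1, 1));         /* [1,1,2,3] */
    v = _mm_min_epu32(v, t);
    return (unsigned)_mm_cvtsi128_si32(v);                     /* lane 0 */
}

Wider inputs just prepend more halving steps: the 256-bit tests extract the high 128 bits and min first, and the 512-bit tests extract the high 256 bits before that.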

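Where no unsigned-min instruction exists (SSE2 has pminub but not pminuw/pminud, and nothing before AVX-512 has pminuq), the checks above show the DAG biasing both operands by the sign bit so the signed pcmpgtd/pcmpgtq can stand in for an unsigned compare, then selecting with pand/pandn/por (or blendvpd on SSE4.2+). The same trick in C, again purely illustrative and assuming SSE2:

#include <emmintrin.h> /* SSE2 */

/* Unsigned 32-bit element-wise min without pminud: xor the sign bit
   into both operands, compare signed, then select -- mirroring the
   pxor/pcmpgtd/pand/pandn/por sequences in the X86-SSE2 blocks above. */
static __m128i umin_epu32_sse2(__m128i a, __m128i b) {
    const __m128i sign = _mm_set1_epi32((int)0x80000000);
    __m128i as = _mm_xor_si128(a, sign);
    __m128i bs = _mm_xor_si128(b, sign);
    __m128i a_lt_b = _mm_cmpgt_epi32(bs, as); /* true where a < b unsigned */
    return _mm_or_si128(_mm_and_si128(a_lt_b, a),
                        _mm_andnot_si128(a_lt_b, b));
}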


