[llvm] r368079 - [X86] Remove uses of the -x86-experimental-vector-widening-legalization flag from test/CodeGen/X86/

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 6 13:12:20 PDT 2019


Author: ctopper
Date: Tue Aug  6 13:12:20 2019
New Revision: 368079

URL: http://llvm.org/viewvc/llvm-project?rev=368079&view=rev
Log:
[X86] Remove uses of the -x86-experimental-vector-widening-legalization flag from test/CodeGen/X86/

This flag is now the default behavior so we no longer need to
set it in tests.

Some redundant tests have been removed after verifying we have
an equivalent test that didn't use the flag.

Added:
    llvm/trunk/test/CodeGen/X86/vector-lzcnt-sub128.ll
Removed:
    llvm/trunk/test/CodeGen/X86/avx512-cvt-widen.ll
    llvm/trunk/test/CodeGen/X86/avx512-trunc-widen.ll
    llvm/trunk/test/CodeGen/X86/shrink_vmul-widen.ll
    llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll
    llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
    llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
    llvm/trunk/test/CodeGen/X86/vec_clz.ll
    llvm/trunk/test/CodeGen/X86/vec_fp_to_int-widen.ll
    llvm/trunk/test/CodeGen/X86/vec_int_to_fp-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-reduce-and-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-reduce-mul-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-reduce-or-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-reduce-smax-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-reduce-smin-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-reduce-umax-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-reduce-umin-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-reduce-xor-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-sext-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-trunc-math-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-trunc-packus-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-trunc-ssat-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-trunc-usat-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-trunc-widen.ll
    llvm/trunk/test/CodeGen/X86/vector-zext-widen.ll
Modified:
    llvm/trunk/test/CodeGen/X86/SwizzleShuff.ll
    llvm/trunk/test/CodeGen/X86/bswap-vector.ll
    llvm/trunk/test/CodeGen/X86/lower-bitcast.ll
    llvm/trunk/test/CodeGen/X86/masked_gather_scatter_widen.ll
    llvm/trunk/test/CodeGen/X86/pmulh.ll
    llvm/trunk/test/CodeGen/X86/vec_cast2.ll
    llvm/trunk/test/CodeGen/X86/vec_cast3.ll
    llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll
    llvm/trunk/test/CodeGen/X86/vector-trunc.ll
    llvm/trunk/test/CodeGen/X86/widen_cast-4.ll
    llvm/trunk/test/CodeGen/X86/widen_conversions.ll
    llvm/trunk/test/CodeGen/X86/widen_mul.ll

Modified: llvm/trunk/test/CodeGen/X86/SwizzleShuff.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/SwizzleShuff.ll?rev=368079&r1=368078&r2=368079&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/SwizzleShuff.ll (original)
+++ llvm/trunk/test/CodeGen/X86/SwizzleShuff.ll Tue Aug  6 13:12:20 2019
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
 
 ; Check that we perform a scalar XOR on i32.
 

Removed: llvm/trunk/test/CodeGen/X86/avx512-cvt-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-cvt-widen.ll?rev=368078&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cvt-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-cvt-widen.ll (removed)
@@ -1,2645 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl  | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512dq  | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQNOVL --check-prefix=AVX512DQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512bw  | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512dq  | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw  | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
-
-
-define <16 x float> @sitof32(<16 x i32> %a) nounwind {
-; ALL-LABEL: sitof32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %b = sitofp <16 x i32> %a to <16 x float>
-  ret <16 x float> %b
-}
-
-define <8 x double> @sltof864(<8 x i64> %a) {
-; NODQ-LABEL: sltof864:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm0
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: sltof864:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: sltof864:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT:    retq
-  %b = sitofp <8 x i64> %a to <8 x double>
-  ret <8 x double> %b
-}
-
-define <4 x double> @slto4f64(<4 x i64> %a) {
-; NODQ-LABEL: slto4f64:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: slto4f64:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtqq2pd %ymm0, %ymm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: slto4f64:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT:    retq
-  %b = sitofp <4 x i64> %a to <4 x double>
-  ret <4 x double> %b
-}
-
-define <2 x double> @slto2f64(<2 x i64> %a) {
-; NODQ-LABEL: slto2f64:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: slto2f64:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtqq2pd %xmm0, %xmm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: slto2f64:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %b = sitofp <2 x i64> %a to <2 x double>
-  ret <2 x double> %b
-}
-
-define <2 x float> @sltof2f32(<2 x i64> %a) {
-; NOVLDQ-LABEL: sltof2f32:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NOVLDQ-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; NOVLDQ-NEXT:    vmovq %xmm0, %rax
-; NOVLDQ-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; NOVLDQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: sltof2f32:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtqq2ps %xmm0, %xmm0
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: sltof2f32:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VLNODQ-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VLNODQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: sltof2f32:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; DQNOVL-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %b = sitofp <2 x i64> %a to <2 x float>
-  ret <2 x float>%b
-}
-
-define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
-; NODQ-LABEL: slto4f32_mem:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vmovdqu (%rdi), %xmm0
-; NODQ-NEXT:    vmovdqu 16(%rdi), %xmm1
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: slto4f32_mem:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtqq2psy (%rdi), %xmm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: slto4f32_mem:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vmovups (%rdi), %ymm0
-; DQNOVL-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %a1 = load <4 x i64>, <4 x i64>* %a, align 8
-  %b = sitofp <4 x i64> %a1 to <4 x float>
-  ret <4 x float>%b
-}
-
-define <4 x i64> @f64to4sl(<4 x double> %a) {
-; NODQ-LABEL: f64to4sl:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; NODQ-NEXT:    vcvttsd2si %xmm1, %rax
-; NODQ-NEXT:    vmovq %rax, %xmm2
-; NODQ-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; NODQ-NEXT:    vcvttsd2si %xmm1, %rax
-; NODQ-NEXT:    vmovq %rax, %xmm1
-; NODQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; NODQ-NEXT:    vcvttsd2si %xmm0, %rax
-; NODQ-NEXT:    vmovq %rax, %xmm2
-; NODQ-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; NODQ-NEXT:    vcvttsd2si %xmm0, %rax
-; NODQ-NEXT:    vmovq %rax, %xmm0
-; NODQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; NODQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: f64to4sl:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttpd2qq %ymm0, %ymm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: f64to4sl:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; DQNOVL-NEXT:    vcvttpd2qq %zmm0, %zmm0
-; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT:    retq
-  %b = fptosi <4 x double> %a to <4 x i64>
-  ret <4 x i64> %b
-}
-
-define <4 x i64> @f32to4sl(<4 x float> %a) {
-; NODQ-LABEL: f32to4sl:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; NODQ-NEXT:    vcvttss2si %xmm1, %rax
-; NODQ-NEXT:    vmovq %rax, %xmm1
-; NODQ-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; NODQ-NEXT:    vcvttss2si %xmm2, %rax
-; NODQ-NEXT:    vmovq %rax, %xmm2
-; NODQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; NODQ-NEXT:    vcvttss2si %xmm0, %rax
-; NODQ-NEXT:    vmovq %rax, %xmm2
-; NODQ-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; NODQ-NEXT:    vcvttss2si %xmm0, %rax
-; NODQ-NEXT:    vmovq %rax, %xmm0
-; NODQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; NODQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: f32to4sl:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttps2qq %xmm0, %ymm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: f32to4sl:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; DQNOVL-NEXT:    vcvttps2qq %ymm0, %zmm0
-; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT:    retq
-  %b = fptosi <4 x float> %a to <4 x i64>
-  ret <4 x i64> %b
-}
-
-define <4 x float> @slto4f32(<4 x i64> %a) {
-; NODQ-LABEL: slto4f32:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT:    vzeroupper
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: slto4f32:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtqq2ps %ymm0, %xmm0
-; VLDQ-NEXT:    vzeroupper
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: slto4f32:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; DQNOVL-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %b = sitofp <4 x i64> %a to <4 x float>
-  ret <4 x float> %b
-}
-
-define <4 x float> @ulto4f32(<4 x i64> %a) {
-; NODQ-LABEL: ulto4f32:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT:    vzeroupper
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: ulto4f32:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtuqq2ps %ymm0, %xmm0
-; VLDQ-NEXT:    vzeroupper
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: ulto4f32:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; DQNOVL-NEXT:    vcvtuqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %b = uitofp <4 x i64> %a to <4 x float>
-  ret <4 x float> %b
-}
-
-define <8 x double> @ulto8f64(<8 x i64> %a) {
-; NODQ-LABEL: ulto8f64:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; NODQ-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; NODQ-NEXT:    vpsrlq $32, %zmm0, %zmm0
-; NODQ-NEXT:    vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; NODQ-NEXT:    vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; NODQ-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: ulto8f64:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: ulto8f64:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvtuqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT:    retq
-  %b = uitofp <8 x i64> %a to <8 x double>
-  ret <8 x double> %b
-}
-
-define <16 x double> @ulto16f64(<16 x i64> %a) {
-; NODQ-LABEL: ulto16f64:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
-; NODQ-NEXT:    vpandq %zmm2, %zmm0, %zmm3
-; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; NODQ-NEXT:    vporq %zmm4, %zmm3, %zmm3
-; NODQ-NEXT:    vpsrlq $32, %zmm0, %zmm0
-; NODQ-NEXT:    vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; NODQ-NEXT:    vporq %zmm5, %zmm0, %zmm0
-; NODQ-NEXT:    vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; NODQ-NEXT:    vsubpd %zmm6, %zmm0, %zmm0
-; NODQ-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
-; NODQ-NEXT:    vpandq %zmm2, %zmm1, %zmm2
-; NODQ-NEXT:    vporq %zmm4, %zmm2, %zmm2
-; NODQ-NEXT:    vpsrlq $32, %zmm1, %zmm1
-; NODQ-NEXT:    vporq %zmm5, %zmm1, %zmm1
-; NODQ-NEXT:    vsubpd %zmm6, %zmm1, %zmm1
-; NODQ-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: ulto16f64:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
-; VLDQ-NEXT:    vcvtuqq2pd %zmm1, %zmm1
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: ulto16f64:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvtuqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT:    vcvtuqq2pd %zmm1, %zmm1
-; DQNOVL-NEXT:    retq
-  %b = uitofp <16 x i64> %a to <16 x double>
-  ret <16 x double> %b
-}
-
-define <16 x i32> @f64to16si(<16 x float> %a) nounwind {
-; ALL-LABEL: f64to16si:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %b = fptosi <16 x float> %a to <16 x i32>
-  ret <16 x i32> %b
-}
-
-define <16 x i8> @f32to16sc(<16 x float> %f) {
-; ALL-LABEL: f32to16sc:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT:    vpmovdb %zmm0, %xmm0
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %res = fptosi <16 x float> %f to <16 x i8>
-  ret <16 x i8> %res
-}
-
-define <16 x i16> @f32to16ss(<16 x float> %f) {
-; ALL-LABEL: f32to16ss:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT:    vpmovdw %zmm0, %ymm0
-; ALL-NEXT:    retq
-  %res = fptosi <16 x float> %f to <16 x i16>
-  ret <16 x i16> %res
-}
-
-define <16 x i32> @f32to16ui(<16 x float> %a) nounwind {
-; ALL-LABEL: f32to16ui:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvttps2udq %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %b = fptoui <16 x float> %a to <16 x i32>
-  ret <16 x i32> %b
-}
-
-define <16 x i8> @f32to16uc(<16 x float> %f) {
-; ALL-LABEL: f32to16uc:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT:    vpmovdb %zmm0, %xmm0
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %res = fptoui <16 x float> %f to <16 x i8>
-  ret <16 x i8> %res
-}
-
-define <16 x i16> @f32to16us(<16 x float> %f) {
-; ALL-LABEL: f32to16us:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT:    vpmovdw %zmm0, %ymm0
-; ALL-NEXT:    retq
-  %res = fptoui <16 x float> %f to <16 x i16>
-  ret <16 x i16> %res
-}
-
-define <8 x i32> @f32to8ui(<8 x float> %a) nounwind {
-; NOVL-LABEL: f32to8ui:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; NOVL-NEXT:    vcvttps2udq %zmm0, %zmm0
-; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: f32to8ui:
-; VL:       # %bb.0:
-; VL-NEXT:    vcvttps2udq %ymm0, %ymm0
-; VL-NEXT:    retq
-  %b = fptoui <8 x float> %a to <8 x i32>
-  ret <8 x i32> %b
-}
-
-define <4 x i32> @f32to4ui(<4 x float> %a) nounwind {
-; NOVL-LABEL: f32to4ui:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; NOVL-NEXT:    vcvttps2udq %zmm0, %zmm0
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT:    vzeroupper
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: f32to4ui:
-; VL:       # %bb.0:
-; VL-NEXT:    vcvttps2udq %xmm0, %xmm0
-; VL-NEXT:    retq
-  %b = fptoui <4 x float> %a to <4 x i32>
-  ret <4 x i32> %b
-}
-
-define <8 x i32> @f64to8ui(<8 x double> %a) nounwind {
-; ALL-LABEL: f64to8ui:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; ALL-NEXT:    retq
-  %b = fptoui <8 x double> %a to <8 x i32>
-  ret <8 x i32> %b
-}
-
-define <8 x i16> @f64to8us(<8 x double> %f) {
-; NOVL-LABEL: f64to8us:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT:    vpmovdw %zmm0, %ymm0
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; NOVL-NEXT:    vzeroupper
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: f64to8us:
-; VL:       # %bb.0:
-; VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT:    vpmovdw %ymm0, %xmm0
-; VL-NEXT:    vzeroupper
-; VL-NEXT:    retq
-  %res = fptoui <8 x double> %f to <8 x i16>
-  ret <8 x i16> %res
-}
-
-define <8 x i8> @f64to8uc(<8 x double> %f) {
-; NOVL-LABEL: f64to8uc:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT:    vpmovdb %zmm0, %xmm0
-; NOVL-NEXT:    vzeroupper
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: f64to8uc:
-; VL:       # %bb.0:
-; VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT:    vpmovdb %ymm0, %xmm0
-; VL-NEXT:    vzeroupper
-; VL-NEXT:    retq
-  %res = fptoui <8 x double> %f to <8 x i8>
-  ret <8 x i8> %res
-}
-
-define <4 x i32> @f64to4ui(<4 x double> %a) nounwind {
-; NOVL-LABEL: f64to4ui:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; NOVL-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; NOVL-NEXT:    vzeroupper
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: f64to4ui:
-; VL:       # %bb.0:
-; VL-NEXT:    vcvttpd2udq %ymm0, %xmm0
-; VL-NEXT:    vzeroupper
-; VL-NEXT:    retq
-  %b = fptoui <4 x double> %a to <4 x i32>
-  ret <4 x i32> %b
-}
-
-define <8 x double> @sito8f64(<8 x i32> %a) {
-; ALL-LABEL: sito8f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT:    retq
-  %b = sitofp <8 x i32> %a to <8 x double>
-  ret <8 x double> %b
-}
-define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; KNL-LABEL: i32to8f64_mask:
-; KNL:       # %bb.0:
-; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
-; KNL-NEXT:    retq
-;
-; VLBW-LABEL: i32to8f64_mask:
-; VLBW:       # %bb.0:
-; VLBW-NEXT:    kmovd %edi, %k1
-; VLBW-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
-; VLBW-NEXT:    retq
-;
-; VLNOBW-LABEL: i32to8f64_mask:
-; VLNOBW:       # %bb.0:
-; VLNOBW-NEXT:    kmovw %edi, %k1
-; VLNOBW-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
-; VLNOBW-NEXT:    retq
-;
-; DQNOVL-LABEL: i32to8f64_mask:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    kmovw %edi, %k1
-; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
-; DQNOVL-NEXT:    retq
-;
-; AVX512BW-LABEL: i32to8f64_mask:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
-; AVX512BW-NEXT:    retq
-  %1 = bitcast i8 %c to <8 x i1>
-  %2 = sitofp <8 x i32> %b to <8 x double>
-  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
-  ret <8 x double> %3
-}
-define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; KNL-LABEL: sito8f64_maskz:
-; KNL:       # %bb.0:
-; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    retq
-;
-; VLBW-LABEL: sito8f64_maskz:
-; VLBW:       # %bb.0:
-; VLBW-NEXT:    kmovd %edi, %k1
-; VLBW-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; VLBW-NEXT:    retq
-;
-; VLNOBW-LABEL: sito8f64_maskz:
-; VLNOBW:       # %bb.0:
-; VLNOBW-NEXT:    kmovw %edi, %k1
-; VLNOBW-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; VLNOBW-NEXT:    retq
-;
-; DQNOVL-LABEL: sito8f64_maskz:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    kmovw %edi, %k1
-; DQNOVL-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    retq
-;
-; AVX512BW-LABEL: sito8f64_maskz:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-  %1 = bitcast i8 %b to <8 x i1>
-  %2 = sitofp <8 x i32> %a to <8 x double>
-  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
-  ret <8 x double> %3
-}
-
-define <8 x i32> @f64to8si(<8 x double> %a) {
-; ALL-LABEL: f64to8si:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; ALL-NEXT:    retq
-  %b = fptosi <8 x double> %a to <8 x i32>
-  ret <8 x i32> %b
-}
-
-define <8 x i16> @f64to8ss(<8 x double> %f) {
-; NOVL-LABEL: f64to8ss:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT:    vpmovdw %zmm0, %ymm0
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; NOVL-NEXT:    vzeroupper
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: f64to8ss:
-; VL:       # %bb.0:
-; VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT:    vpmovdw %ymm0, %xmm0
-; VL-NEXT:    vzeroupper
-; VL-NEXT:    retq
-  %res = fptosi <8 x double> %f to <8 x i16>
-  ret <8 x i16> %res
-}
-
-define <8 x i8> @f64to8sc(<8 x double> %f) {
-; NOVL-LABEL: f64to8sc:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT:    vpmovdb %zmm0, %xmm0
-; NOVL-NEXT:    vzeroupper
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: f64to8sc:
-; VL:       # %bb.0:
-; VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT:    vpmovdb %ymm0, %xmm0
-; VL-NEXT:    vzeroupper
-; VL-NEXT:    retq
-  %res = fptosi <8 x double> %f to <8 x i8>
-  ret <8 x i8> %res
-}
-
-define <4 x i32> @f64to4si(<4 x double> %a) {
-; ALL-LABEL: f64to4si:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %b = fptosi <4 x double> %a to <4 x i32>
-  ret <4 x i32> %b
-}
-
-define <16 x float> @f64to16f32(<16 x double> %b) nounwind {
-; ALL-LABEL: f64to16f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtpd2ps %zmm0, %ymm0
-; ALL-NEXT:    vcvtpd2ps %zmm1, %ymm1
-; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %a = fptrunc <16 x double> %b to <16 x float>
-  ret <16 x float> %a
-}
-
-define <4 x float> @f64to4f32(<4 x double> %b) {
-; ALL-LABEL: f64to4f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %a = fptrunc <4 x double> %b to <4 x float>
-  ret <4 x float> %a
-}
-
-define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
-; NOVLDQ-LABEL: f64to4f32_mask:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    vpslld $31, %xmm1, %xmm1
-; NOVLDQ-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NOVLDQ-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; NOVLDQ-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT:    vzeroupper
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: f64to4f32_mask:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vpslld $31, %xmm1, %xmm1
-; VLDQ-NEXT:    vpmovd2m %xmm1, %k1
-; VLDQ-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
-; VLDQ-NEXT:    vzeroupper
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: f64to4f32_mask:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vpslld $31, %xmm1, %xmm1
-; VLNODQ-NEXT:    vptestmd %xmm1, %xmm1, %k1
-; VLNODQ-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
-; VLNODQ-NEXT:    vzeroupper
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: f64to4f32_mask:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vpslld $31, %xmm1, %xmm1
-; DQNOVL-NEXT:    vpmovd2m %zmm1, %k1
-; DQNOVL-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; DQNOVL-NEXT:    vmovaps %zmm0, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %a = fptrunc <4 x double> %b to <4 x float>
-  %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
-  ret <4 x float> %c
-}
-
-define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
-; ALL-LABEL: f64tof32_inreg:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm1, %xmm0
-; ALL-NEXT:    retq
-  %ext = extractelement <2 x double> %a0, i32 0
-  %cvt = fptrunc double %ext to float
-  %res = insertelement <4 x float> %a1, float %cvt, i32 0
-  ret <4 x float> %res
-}
-
-define <8 x double> @f32to8f64(<8 x float> %b) nounwind {
-; ALL-LABEL: f32to8f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtps2pd %ymm0, %zmm0
-; ALL-NEXT:    retq
-  %a = fpext <8 x float> %b to <8 x double>
-  ret <8 x double> %a
-}
-
-define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) {
-; NOVL-LABEL: f32to4f64_mask:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    # kill: def $ymm2 killed $ymm2 def $zmm2
-; NOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVL-NEXT:    vcvtps2pd %xmm0, %ymm0
-; NOVL-NEXT:    vcmpltpd %zmm2, %zmm1, %k1
-; NOVL-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
-; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: f32to4f64_mask:
-; VL:       # %bb.0:
-; VL-NEXT:    vcmpltpd %ymm2, %ymm1, %k1
-; VL-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
-; VL-NEXT:    retq
-  %a = fpext <4 x float> %b to <4 x double>
-  %mask = fcmp ogt <4 x double> %a1, %b1
-  %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer
-  ret <4 x double> %c
-}
-
-define <4 x double> @f32to4f64_mask_load(<4 x float>* %p, <4 x double> %b1, <4 x double> %a1) {
-; NOVL-LABEL: f32to4f64_mask_load:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; NOVL-NEXT:    vcvtps2pd (%rdi), %ymm2
-; NOVL-NEXT:    vcmpltpd %zmm1, %zmm0, %k1
-; NOVL-NEXT:    vmovapd %zmm2, %zmm0 {%k1} {z}
-; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: f32to4f64_mask_load:
-; VL:       # %bb.0:
-; VL-NEXT:    vcmpltpd %ymm1, %ymm0, %k1
-; VL-NEXT:    vcvtps2pd (%rdi), %ymm0 {%k1} {z}
-; VL-NEXT:    retq
-  %b = load <4 x float>, <4 x float>* %p
-  %a = fpext <4 x float> %b to <4 x double>
-  %mask = fcmp ogt <4 x double> %a1, %b1
-  %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer
-  ret <4 x double> %c
-}
-
-define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
-; ALL-LABEL: f32tof64_inreg:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm0
-; ALL-NEXT:    retq
-  %ext = extractelement <4 x float> %a1, i32 0
-  %cvt = fpext float %ext to double
-  %res = insertelement <2 x double> %a0, double %cvt, i32 0
-  ret <2 x double> %res
-}
-
-define double @sltof64_load(i64* nocapture %e) {
-; ALL-LABEL: sltof64_load:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0
-; ALL-NEXT:    retq
-entry:
-  %tmp1 = load i64, i64* %e, align 8
-  %conv = sitofp i64 %tmp1 to double
-  ret double %conv
-}
-
-define double @sitof64_load(i32* %e) {
-; ALL-LABEL: sitof64_load:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0
-; ALL-NEXT:    retq
-entry:
-  %tmp1 = load i32, i32* %e, align 4
-  %conv = sitofp i32 %tmp1 to double
-  ret double %conv
-}
-
-define float @sitof32_load(i32* %e) {
-; ALL-LABEL: sitof32_load:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0
-; ALL-NEXT:    retq
-entry:
-  %tmp1 = load i32, i32* %e, align 4
-  %conv = sitofp i32 %tmp1 to float
-  ret float %conv
-}
-
-define float @sltof32_load(i64* %e) {
-; ALL-LABEL: sltof32_load:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0
-; ALL-NEXT:    retq
-entry:
-  %tmp1 = load i64, i64* %e, align 8
-  %conv = sitofp i64 %tmp1 to float
-  ret float %conv
-}
-
-define void @f32tof64_loadstore() {
-; ALL-LABEL: f32tof64_loadstore:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vmovsd %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    retq
-entry:
-  %f = alloca float, align 4
-  %d = alloca double, align 8
-  %tmp = load float, float* %f, align 4
-  %conv = fpext float %tmp to double
-  store double %conv, double* %d, align 8
-  ret void
-}
-
-define void @f64tof32_loadstore() nounwind uwtable {
-; ALL-LABEL: f64tof32_loadstore:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT:    retq
-entry:
-  %f = alloca float, align 4
-  %d = alloca double, align 8
-  %tmp = load double, double* %d, align 8
-  %conv = fptrunc double %tmp to float
-  store float %conv, float* %f, align 4
-  ret void
-}
-
-define double @long_to_double(i64 %x) {
-; ALL-LABEL: long_to_double:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vmovq %rdi, %xmm0
-; ALL-NEXT:    retq
-   %res = bitcast i64 %x to double
-   ret double %res
-}
-
-define i64 @double_to_long(double %x) {
-; ALL-LABEL: double_to_long:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vmovq %xmm0, %rax
-; ALL-NEXT:    retq
-   %res = bitcast double %x to i64
-   ret i64 %res
-}
-
-define float @int_to_float(i32 %x) {
-; ALL-LABEL: int_to_float:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vmovd %edi, %xmm0
-; ALL-NEXT:    retq
-   %res = bitcast i32 %x to float
-   ret float %res
-}
-
-define i32 @float_to_int(float %x) {
-; ALL-LABEL: float_to_int:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vmovd %xmm0, %eax
-; ALL-NEXT:    retq
-   %res = bitcast float %x to i32
-   ret i32 %res
-}
-
-define <16 x double> @uito16f64(<16 x i32> %a) nounwind {
-; ALL-LABEL: uito16f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtudq2pd %ymm0, %zmm2
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vcvtudq2pd %ymm0, %zmm1
-; ALL-NEXT:    vmovaps %zmm2, %zmm0
-; ALL-NEXT:    retq
-  %b = uitofp <16 x i32> %a to <16 x double>
-  ret <16 x double> %b
-}
-
-define <8 x float> @slto8f32(<8 x i64> %a) {
-; NODQ-LABEL: slto8f32:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: slto8f32:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: slto8f32:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT:    retq
-  %b = sitofp <8 x i64> %a to <8 x float>
-  ret <8 x float> %b
-}
-
-define <16 x float> @slto16f32(<16 x i64> %a) {
-; NODQ-LABEL: slto16f32:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: slto16f32:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; VLDQ-NEXT:    vcvtqq2ps %zmm1, %ymm1
-; VLDQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: slto16f32:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT:    vcvtqq2ps %zmm1, %ymm1
-; DQNOVL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; DQNOVL-NEXT:    retq
-  %b = sitofp <16 x i64> %a to <16 x float>
-  ret <16 x float> %b
-}
-
-define <8 x double> @slto8f64(<8 x i64> %a) {
-; NODQ-LABEL: slto8f64:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm0
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: slto8f64:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: slto8f64:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT:    retq
-  %b = sitofp <8 x i64> %a to <8 x double>
-  ret <8 x double> %b
-}
-
-define <16 x double> @slto16f64(<16 x i64> %a) {
-; NODQ-LABEL: slto16f64:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm4, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm0
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm2
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtsi2sd %rax, %xmm5, %xmm1
-; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: slto16f64:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; VLDQ-NEXT:    vcvtqq2pd %zmm1, %zmm1
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: slto16f64:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT:    vcvtqq2pd %zmm1, %zmm1
-; DQNOVL-NEXT:    retq
-  %b = sitofp <16 x i64> %a to <16 x double>
-  ret <16 x double> %b
-}
-
-define <8 x float> @ulto8f32(<8 x i64> %a) {
-; NODQ-LABEL: ulto8f32:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm1
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm3
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm3
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: ulto8f32:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: ulto8f32:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvtuqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT:    retq
-  %b = uitofp <8 x i64> %a to <8 x float>
-  ret <8 x float> %b
-}
-
-define <16 x float> @ulto16f32(<16 x i64> %a) {
-; NODQ-LABEL: ulto16f32:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; NODQ-NEXT:    vmovq %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm1
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovq %xmm2, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; NODQ-NEXT:    vmovq %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
-; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: ulto16f32:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
-; VLDQ-NEXT:    vcvtuqq2ps %zmm1, %ymm1
-; VLDQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: ulto16f32:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvtuqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT:    vcvtuqq2ps %zmm1, %ymm1
-; DQNOVL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; DQNOVL-NEXT:    retq
-  %b = uitofp <16 x i64> %a to <16 x float>
-  ret <16 x float> %b
-}
-
-define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; KNL-LABEL: uito8f64_mask:
-; KNL:       # %bb.0:
-; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
-; KNL-NEXT:    retq
-;
-; VLBW-LABEL: uito8f64_mask:
-; VLBW:       # %bb.0:
-; VLBW-NEXT:    kmovd %edi, %k1
-; VLBW-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
-; VLBW-NEXT:    retq
-;
-; VLNOBW-LABEL: uito8f64_mask:
-; VLNOBW:       # %bb.0:
-; VLNOBW-NEXT:    kmovw %edi, %k1
-; VLNOBW-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
-; VLNOBW-NEXT:    retq
-;
-; DQNOVL-LABEL: uito8f64_mask:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    kmovw %edi, %k1
-; DQNOVL-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
-; DQNOVL-NEXT:    retq
-;
-; AVX512BW-LABEL: uito8f64_mask:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
-; AVX512BW-NEXT:    retq
-  %1 = bitcast i8 %c to <8 x i1>
-  %2 = uitofp <8 x i32> %b to <8 x double>
-  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
-  ret <8 x double> %3
-}
-define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; KNL-LABEL: uito8f64_maskz:
-; KNL:       # %bb.0:
-; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    retq
-;
-; VLBW-LABEL: uito8f64_maskz:
-; VLBW:       # %bb.0:
-; VLBW-NEXT:    kmovd %edi, %k1
-; VLBW-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; VLBW-NEXT:    retq
-;
-; VLNOBW-LABEL: uito8f64_maskz:
-; VLNOBW:       # %bb.0:
-; VLNOBW-NEXT:    kmovw %edi, %k1
-; VLNOBW-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; VLNOBW-NEXT:    retq
-;
-; DQNOVL-LABEL: uito8f64_maskz:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    kmovw %edi, %k1
-; DQNOVL-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    retq
-;
-; AVX512BW-LABEL: uito8f64_maskz:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-  %1 = bitcast i8 %b to <8 x i1>
-  %2 = uitofp <8 x i32> %a to <8 x double>
-  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
-  ret <8 x double> %3
-}
-
-define <4 x double> @uito4f64(<4 x i32> %a) nounwind {
-; NOVL-LABEL: uito4f64:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; NOVL-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: uito4f64:
-; VL:       # %bb.0:
-; VL-NEXT:    vcvtudq2pd %xmm0, %ymm0
-; VL-NEXT:    retq
-  %b = uitofp <4 x i32> %a to <4 x double>
-  ret <4 x double> %b
-}
-
-define <16 x float> @uito16f32(<16 x i32> %a) nounwind {
-; ALL-LABEL: uito16f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %b = uitofp <16 x i32> %a to <16 x float>
-  ret <16 x float> %b
-}
-
-define <8 x double> @uito8f64(<8 x i32> %a) {
-; ALL-LABEL: uito8f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; ALL-NEXT:    retq
-  %b = uitofp <8 x i32> %a to <8 x double>
-  ret <8 x double> %b
-}
-
-define <8 x float> @uito8f32(<8 x i32> %a) nounwind {
-; NOVL-LABEL: uito8f32:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; NOVL-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; NOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: uito8f32:
-; VL:       # %bb.0:
-; VL-NEXT:    vcvtudq2ps %ymm0, %ymm0
-; VL-NEXT:    retq
-  %b = uitofp <8 x i32> %a to <8 x float>
-  ret <8 x float> %b
-}
-
-define <4 x float> @uito4f32(<4 x i32> %a) nounwind {
-; NOVL-LABEL: uito4f32:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; NOVL-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; NOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT:    vzeroupper
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: uito4f32:
-; VL:       # %bb.0:
-; VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
-; VL-NEXT:    retq
-  %b = uitofp <4 x i32> %a to <4 x float>
-  ret <4 x float> %b
-}
-
-define i32 @fptosi(float %a) nounwind {
-; ALL-LABEL: fptosi:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvttss2si %xmm0, %eax
-; ALL-NEXT:    retq
-  %b = fptosi float %a to i32
-  ret i32 %b
-}
-
-define i32 @fptoui(float %a) nounwind {
-; ALL-LABEL: fptoui:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvttss2usi %xmm0, %eax
-; ALL-NEXT:    retq
-  %b = fptoui float %a to i32
-  ret i32 %b
-}
-
-define float @uitof32(i32 %a) nounwind {
-; ALL-LABEL: uitof32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtusi2ss %edi, %xmm0, %xmm0
-; ALL-NEXT:    retq
-  %b = uitofp i32 %a to float
-  ret float %b
-}
-
-define double @uitof64(i32 %a) nounwind {
-; ALL-LABEL: uitof64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtusi2sd %edi, %xmm0, %xmm0
-; ALL-NEXT:    retq
-  %b = uitofp i32 %a to double
-  ret double %b
-}
-
-define <16 x float> @sbto16f32(<16 x i32> %a) {
-; NODQ-LABEL: sbto16f32:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NODQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
-; NODQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NODQ-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: sbto16f32:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vpmovd2m %zmm0, %k0
-; VLDQ-NEXT:    vpmovm2d %k0, %zmm0
-; VLDQ-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: sbto16f32:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k0
-; DQNOVL-NEXT:    vpmovm2d %k0, %zmm0
-; DQNOVL-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; DQNOVL-NEXT:    retq
-  %mask = icmp slt <16 x i32> %a, zeroinitializer
-  %1 = sitofp <16 x i1> %mask to <16 x float>
-  ret <16 x float> %1
-}
-
-define <16 x float> @scto16f32(<16 x i8> %a) {
-; ALL-LABEL: scto16f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %1 = sitofp <16 x i8> %a to <16 x float>
-  ret <16 x float> %1
-}
-
-define <16 x float> @ssto16f32(<16 x i16> %a) {
-; ALL-LABEL: ssto16f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %1 = sitofp <16 x i16> %a to <16 x float>
-  ret <16 x float> %1
-}
-
-define <8 x double> @ssto16f64(<8 x i16> %a) {
-; ALL-LABEL: ssto16f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovsxwd %xmm0, %ymm0
-; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT:    retq
-  %1 = sitofp <8 x i16> %a to <8 x double>
-  ret <8 x double> %1
-}
-
-define <8 x double> @scto8f64(<8 x i8> %a) {
-; ALL-LABEL: scto8f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovsxbd %xmm0, %ymm0
-; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT:    retq
-  %1 = sitofp <8 x i8> %a to <8 x double>
-  ret <8 x double> %1
-}
-
-define <16 x double> @scto16f64(<16 x i8> %a) {
-; ALL-LABEL: scto16f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovsxbd %xmm0, %zmm1
-; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; ALL-NEXT:    retq
-  %b = sitofp <16 x i8> %a to <16 x double>
-  ret <16 x double> %b
-}
-
-define <16 x double> @sbto16f64(<16 x double> %a) {
-; NODQ-LABEL: sbto16f64:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; NODQ-NEXT:    vcmpltpd %zmm0, %zmm2, %k0
-; NODQ-NEXT:    vcmpltpd %zmm1, %zmm2, %k1
-; NODQ-NEXT:    kunpckbw %k0, %k1, %k1
-; NODQ-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NODQ-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; NODQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; NODQ-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: sbto16f64:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; VLDQ-NEXT:    vcmpltpd %zmm0, %zmm2, %k0
-; VLDQ-NEXT:    vcmpltpd %zmm1, %zmm2, %k1
-; VLDQ-NEXT:    kunpckbw %k0, %k1, %k0
-; VLDQ-NEXT:    vpmovm2d %k0, %zmm1
-; VLDQ-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; VLDQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; VLDQ-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: sbto16f64:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; DQNOVL-NEXT:    vcmpltpd %zmm0, %zmm2, %k0
-; DQNOVL-NEXT:    vcmpltpd %zmm1, %zmm2, %k1
-; DQNOVL-NEXT:    kunpckbw %k0, %k1, %k0
-; DQNOVL-NEXT:    vpmovm2d %k0, %zmm1
-; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; DQNOVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; DQNOVL-NEXT:    retq
-  %cmpres = fcmp ogt <16 x double> %a, zeroinitializer
-  %1 = sitofp <16 x i1> %cmpres to <16 x double>
-  ret <16 x double> %1
-}
-
-define <8 x double> @sbto8f64(<8 x double> %a) {
-; NOVLDQ-LABEL: sbto8f64:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; NOVLDQ-NEXT:    vcmpltpd %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: sbto8f64:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; VLDQ-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
-; VLDQ-NEXT:    vpmovm2d %k0, %ymm0
-; VLDQ-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: sbto8f64:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; VLNODQ-NEXT:    vcmpltpd %zmm0, %zmm1, %k1
-; VLNODQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; VLNODQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; VLNODQ-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: sbto8f64:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; DQNOVL-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
-; DQNOVL-NEXT:    vpmovm2d %k0, %zmm0
-; DQNOVL-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; DQNOVL-NEXT:    retq
-  %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
-  %1 = sitofp <8 x i1> %cmpres to <8 x double>
-  ret <8 x double> %1
-}
-
-define <8 x float> @sbto8f32(<8 x float> %a) {
-; ALL-LABEL: sbto8f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
-; ALL-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; ALL-NEXT:    retq
-  %cmpres = fcmp ogt <8 x float> %a, zeroinitializer
-  %1 = sitofp <8 x i1> %cmpres to <8 x float>
-  ret <8 x float> %1
-}
-
-define <4 x float> @sbto4f32(<4 x float> %a) {
-; ALL-LABEL: sbto4f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
-; ALL-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; ALL-NEXT:    retq
-  %cmpres = fcmp ogt <4 x float> %a, zeroinitializer
-  %1 = sitofp <4 x i1> %cmpres to <4 x float>
-  ret <4 x float> %1
-}
-
-define <4 x double> @sbto4f64(<4 x double> %a) {
-; NOVL-LABEL: sbto4f64:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
-; NOVL-NEXT:    vpmovqd %zmm0, %ymm0
-; NOVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; NOVL-NEXT:    retq
-;
-; VLDQ-LABEL: sbto4f64:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; VLDQ-NEXT:    vcmpltpd %ymm0, %ymm1, %k0
-; VLDQ-NEXT:    vpmovm2d %k0, %xmm0
-; VLDQ-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: sbto4f64:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; VLNODQ-NEXT:    vcmpltpd %ymm0, %ymm1, %k1
-; VLNODQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; VLNODQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; VLNODQ-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; VLNODQ-NEXT:    retq
-  %cmpres = fcmp ogt <4 x double> %a, zeroinitializer
-  %1 = sitofp <4 x i1> %cmpres to <4 x double>
-  ret <4 x double> %1
-}
-
-define <2 x float> @sbto2f32(<2 x float> %a) {
-; ALL-LABEL: sbto2f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
-; ALL-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; ALL-NEXT:    retq
-  %cmpres = fcmp ogt <2 x float> %a, zeroinitializer
-  %1 = sitofp <2 x i1> %cmpres to <2 x float>
-  ret <2 x float> %1
-}
-
-define <2 x double> @sbto2f64(<2 x double> %a) {
-; NOVL-LABEL: sbto2f64:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
-; NOVL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; NOVL-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; NOVL-NEXT:    retq
-;
-; VLDQ-LABEL: sbto2f64:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; VLDQ-NEXT:    vcmpltpd %xmm0, %xmm1, %k0
-; VLDQ-NEXT:    vpmovm2d %k0, %xmm0
-; VLDQ-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: sbto2f64:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; VLNODQ-NEXT:    vcmpltpd %xmm0, %xmm1, %k1
-; VLNODQ-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; VLNODQ-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; VLNODQ-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VLNODQ-NEXT:    retq
-  %cmpres = fcmp ogt <2 x double> %a, zeroinitializer
-  %1 = sitofp <2 x i1> %cmpres to <2 x double>
-  ret <2 x double> %1
-}
-
-define <16 x float> @ucto16f32(<16 x i8> %a) {
-; ALL-LABEL: ucto16f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %b = uitofp <16 x i8> %a to <16 x float>
-  ret <16 x float>%b
-}
-
-define <8 x double> @ucto8f64(<8 x i8> %a) {
-; ALL-LABEL: ucto8f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT:    retq
-  %b = uitofp <8 x i8> %a to <8 x double>
-  ret <8 x double> %b
-}
-
-define <16 x float> @swto16f32(<16 x i16> %a) {
-; ALL-LABEL: swto16f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovsxwd %ymm0, %zmm0
-; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %b = sitofp <16 x i16> %a to <16 x float>
-  ret <16 x float> %b
-}
-
-define <8 x double> @swto8f64(<8 x i16> %a) {
-; ALL-LABEL: swto8f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovsxwd %xmm0, %ymm0
-; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT:    retq
-  %b = sitofp <8 x i16> %a to <8 x double>
-  ret <8 x double> %b
-}
-
-define <16 x double> @swto16f64(<16 x i16> %a) {
-; ALL-LABEL: swto16f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovsxwd %ymm0, %zmm1
-; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; ALL-NEXT:    retq
-  %b = sitofp <16 x i16> %a to <16 x double>
-  ret <16 x double> %b
-}
-
-define <16 x double> @ucto16f64(<16 x i8> %a) {
-; ALL-LABEL: ucto16f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; ALL-NEXT:    retq
-  %b = uitofp <16 x i8> %a to <16 x double>
-  ret <16 x double> %b
-}
-
-define <16 x float> @uwto16f32(<16 x i16> %a) {
-; ALL-LABEL: uwto16f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %b = uitofp <16 x i16> %a to <16 x float>
-  ret <16 x float> %b
-}
-
-define <8 x double> @uwto8f64(<8 x i16> %a) {
-; ALL-LABEL: uwto8f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT:    retq
-  %b = uitofp <8 x i16> %a to <8 x double>
-  ret <8 x double> %b
-}
-
-define <16 x double> @uwto16f64(<16 x i16> %a) {
-; ALL-LABEL: uwto16f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; ALL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; ALL-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; ALL-NEXT:    retq
-  %b = uitofp <16 x i16> %a to <16 x double>
-  ret <16 x double> %b
-}
-
-define <16 x float> @sito16f32(<16 x i32> %a) {
-; ALL-LABEL: sito16f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %b = sitofp <16 x i32> %a to <16 x float>
-  ret <16 x float> %b
-}
-
-define <16 x double> @sito16f64(<16 x i32> %a) {
-; ALL-LABEL: sito16f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm2
-; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm1
-; ALL-NEXT:    vmovaps %zmm2, %zmm0
-; ALL-NEXT:    retq
-  %b = sitofp <16 x i32> %a to <16 x double>
-  ret <16 x double> %b
-}
-
-define <16 x float> @usto16f32(<16 x i16> %a) {
-; ALL-LABEL: usto16f32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %b = uitofp <16 x i16> %a to <16 x float>
-  ret <16 x float> %b
-}
-
-define <16 x float> @ubto16f32(<16 x i32> %a) {
-; NODQ-LABEL: ubto16f32:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NODQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
-; NODQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NODQ-NEXT:    vpsrld $31, %zmm0, %zmm0
-; NODQ-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: ubto16f32:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vpmovd2m %zmm0, %k0
-; VLDQ-NEXT:    vpmovm2d %k0, %zmm0
-; VLDQ-NEXT:    vpsrld $31, %zmm0, %zmm0
-; VLDQ-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: ubto16f32:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k0
-; DQNOVL-NEXT:    vpmovm2d %k0, %zmm0
-; DQNOVL-NEXT:    vpsrld $31, %zmm0, %zmm0
-; DQNOVL-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; DQNOVL-NEXT:    retq
-  %mask = icmp slt <16 x i32> %a, zeroinitializer
-  %1 = uitofp <16 x i1> %mask to <16 x float>
-  ret <16 x float> %1
-}
-
-define <16 x double> @ubto16f64(<16 x i32> %a) {
-; NODQ-LABEL: ubto16f64:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NODQ-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
-; NODQ-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NODQ-NEXT:    vpsrld $31, %zmm0, %zmm1
-; NODQ-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; NODQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; NODQ-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: ubto16f64:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vpmovd2m %zmm0, %k0
-; VLDQ-NEXT:    vpmovm2d %k0, %zmm0
-; VLDQ-NEXT:    vpsrld $31, %zmm0, %zmm1
-; VLDQ-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; VLDQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; VLDQ-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: ubto16f64:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k0
-; DQNOVL-NEXT:    vpmovm2d %k0, %zmm0
-; DQNOVL-NEXT:    vpsrld $31, %zmm0, %zmm1
-; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm0
-; DQNOVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; DQNOVL-NEXT:    vcvtdq2pd %ymm1, %zmm1
-; DQNOVL-NEXT:    retq
-  %mask = icmp slt <16 x i32> %a, zeroinitializer
-  %1 = uitofp <16 x i1> %mask to <16 x double>
-  ret <16 x double> %1
-}
-
-define <8 x float> @ubto8f32(<8 x i32> %a) {
-; NOVL-LABEL: ubto8f32:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
-; NOVL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216]
-; NOVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: ubto8f32:
-; VL:       # %bb.0:
-; VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VL-NEXT:    vpcmpgtd %ymm0, %ymm1, %ymm0
-; VL-NEXT:    vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; VL-NEXT:    retq
-  %mask = icmp slt <8 x i32> %a, zeroinitializer
-  %1 = uitofp <8 x i1> %mask to <8 x float>
-  ret <8 x float> %1
-}
-
-define <8 x double> @ubto8f64(<8 x i32> %a) {
-; ALL-LABEL: ubto8f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpsrld $31, %ymm0, %ymm0
-; ALL-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT:    retq
-  %mask = icmp slt <8 x i32> %a, zeroinitializer
-  %1 = uitofp <8 x i1> %mask to <8 x double>
-  ret <8 x double> %1
-}
-
-define <4 x float> @ubto4f32(<4 x i32> %a) {
-; NOVL-LABEL: ubto4f32:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
-; NOVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
-; NOVL-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: ubto4f32:
-; VL:       # %bb.0:
-; VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
-; VL-NEXT:    vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; VL-NEXT:    retq
-  %mask = icmp slt <4 x i32> %a, zeroinitializer
-  %1 = uitofp <4 x i1> %mask to <4 x float>
-  ret <4 x float> %1
-}
-
-define <4 x double> @ubto4f64(<4 x i32> %a) {
-; ALL-LABEL: ubto4f64:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vpsrld $31, %xmm0, %xmm0
-; ALL-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; ALL-NEXT:    retq
-  %mask = icmp slt <4 x i32> %a, zeroinitializer
-  %1 = uitofp <4 x i1> %mask to <4 x double>
-  ret <4 x double> %1
-}
-
-define <2 x float> @ubto2f32(<2 x i32> %a) {
-; NOVL-LABEL: ubto2f32:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; NOVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
-; NOVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: ubto2f32:
-; VL:       # %bb.0:
-; VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; VL-NEXT:    vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; VL-NEXT:    retq
-  %mask = icmp ne <2 x i32> %a, zeroinitializer
-  %1 = uitofp <2 x i1> %mask to <2 x float>
-  ret <2 x float> %1
-}
-
-define <2 x double> @ubto2f64(<2 x i32> %a) {
-; NOVL-LABEL: ubto2f64:
-; NOVL:       # %bb.0:
-; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; NOVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; NOVL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
-; NOVL-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; NOVL-NEXT:    retq
-;
-; VL-LABEL: ubto2f64:
-; VL:       # %bb.0:
-; VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VL-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; VL-NEXT:    vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; VL-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VL-NEXT:    retq
-  %mask = icmp ne <2 x i32> %a, zeroinitializer
-  %1 = uitofp <2 x i1> %mask to <2 x double>
-  ret <2 x double> %1
-}
-
-define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) {
-; NOVLDQ-LABEL: test_2f64toub:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; NOVLDQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; NOVLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT:    vzeroupper
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_2f64toub:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttpd2udq %xmm0, %xmm0
-; VLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_2f64toub:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttpd2udq %xmm0, %xmm0
-; VLNODQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_2f64toub:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; DQNOVL-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; DQNOVL-NEXT:    vpslld $31, %ymm0, %ymm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %mask = fptoui <2 x double> %a to <2 x i1>
-  %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
-  ret <2 x i64> %select
-}
-
-define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) {
-; NOVLDQ-LABEL: test_4f64toub:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVLDQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; NOVLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_4f64toub:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; VLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_4f64toub:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; VLNODQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_4f64toub:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; DQNOVL-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; DQNOVL-NEXT:    vpslld $31, %xmm0, %xmm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT:    retq
-  %mask = fptoui <4 x double> %a to <4 x i1>
-  %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
-  ret <4 x i64> %select
-}
-
-define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) {
-; NOVLDQ-LABEL: test_8f64toub:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; NOVLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_8f64toub:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
-; VLDQ-NEXT:    vpmovd2m %ymm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_8f64toub:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; VLNODQ-NEXT:    vpslld $31, %ymm0, %ymm0
-; VLNODQ-NEXT:    vptestmd %ymm0, %ymm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_8f64toub:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; DQNOVL-NEXT:    vpslld $31, %ymm0, %ymm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    retq
-  %mask = fptoui <8 x double> %a to <8 x i1>
-  %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
-  ret <8 x i64> %select
-}
-
-define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) {
-; NOVLDQ-LABEL: test_2f32toub:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; NOVLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT:    vzeroupper
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_2f32toub:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; VLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_2f32toub:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; VLNODQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_2f32toub:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; DQNOVL-NEXT:    vcvttps2dq %xmm0, %xmm0
-; DQNOVL-NEXT:    vpslld $31, %xmm0, %xmm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %mask = fptoui <2 x float> %a to <2 x i1>
-  %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
-  ret <2 x i64> %select
-}
-
-define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) {
-; NOVLDQ-LABEL: test_4f32toub:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; NOVLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_4f32toub:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; VLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_4f32toub:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; VLNODQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_4f32toub:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; DQNOVL-NEXT:    vcvttps2dq %xmm0, %xmm0
-; DQNOVL-NEXT:    vpslld $31, %xmm0, %xmm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT:    retq
-  %mask = fptoui <4 x float> %a to <4 x i1>
-  %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
-  ret <4 x i64> %select
-}
-
-define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) {
-; NOVLDQ-LABEL: test_8f32toub:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    vcvttps2dq %ymm0, %ymm0
-; NOVLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_8f32toub:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttps2dq %ymm0, %ymm0
-; VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
-; VLDQ-NEXT:    vpmovd2m %ymm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_8f32toub:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttps2dq %ymm0, %ymm0
-; VLNODQ-NEXT:    vpslld $31, %ymm0, %ymm0
-; VLNODQ-NEXT:    vptestmd %ymm0, %ymm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_8f32toub:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvttps2dq %ymm0, %ymm0
-; DQNOVL-NEXT:    vpslld $31, %ymm0, %ymm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    retq
-  %mask = fptoui <8 x float> %a to <8 x i1>
-  %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
-  ret <8 x i64> %select
-}
-
-define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) {
-; NODQ-LABEL: test_16f32toub:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vcvttps2dq %zmm0, %zmm0
-; NODQ-NEXT:    vpslld $31, %zmm0, %zmm0
-; NODQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NODQ-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_16f32toub:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttps2dq %zmm0, %zmm0
-; VLDQ-NEXT:    vpslld $31, %zmm0, %zmm0
-; VLDQ-NEXT:    vpmovd2m %zmm0, %k1
-; VLDQ-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_16f32toub:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvttps2dq %zmm0, %zmm0
-; DQNOVL-NEXT:    vpslld $31, %zmm0, %zmm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    retq
-  %mask = fptoui <16 x float> %a to <16 x i1>
-  %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
-  ret <16 x i32> %select
-}
-
-define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) {
-; NOVLDQ-LABEL: test_2f64tosb:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVLDQ-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; NOVLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT:    vzeroupper
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_2f64tosb:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; VLDQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_2f64tosb:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; VLNODQ-NEXT:    vpslld $31, %xmm0, %xmm0
-; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_2f64tosb:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; DQNOVL-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; DQNOVL-NEXT:    vpslld $31, %xmm0, %xmm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %mask = fptosi <2 x double> %a to <2 x i1>
-  %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
-  ret <2 x i64> %select
-}
-
-define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) {
-; NOVLDQ-LABEL: test_4f64tosb:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVLDQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_4f64tosb:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_4f64tosb:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_4f64tosb:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; DQNOVL-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT:    retq
-  %mask = fptosi <4 x double> %a to <4 x i1>
-  %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
-  ret <4 x i64> %select
-}
-
-define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) {
-; NOVLDQ-LABEL: test_8f64tosb:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_8f64tosb:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; VLDQ-NEXT:    vpmovd2m %ymm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_8f64tosb:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; VLNODQ-NEXT:    vptestmd %ymm0, %ymm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_8f64tosb:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    retq
-  %mask = fptosi <8 x double> %a to <8 x i1>
-  %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
-  ret <8 x i64> %select
-}
-
-define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) {
-; NOVLDQ-LABEL: test_2f32tosb:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT:    vzeroupper
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_2f32tosb:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_2f32tosb:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_2f32tosb:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; DQNOVL-NEXT:    vcvttps2dq %xmm0, %xmm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %mask = fptosi <2 x float> %a to <2 x i1>
-  %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
-  ret <2 x i64> %select
-}
-
-define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) {
-; NOVLDQ-LABEL: test_4f32tosb:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_4f32tosb:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; VLDQ-NEXT:    vpmovd2m %xmm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_4f32tosb:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttps2dq %xmm0, %xmm0
-; VLNODQ-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_4f32tosb:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
-; DQNOVL-NEXT:    vcvttps2dq %xmm0, %xmm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT:    retq
-  %mask = fptosi <4 x float> %a to <4 x i1>
-  %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
-  ret <4 x i64> %select
-}
-
-define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) {
-; NOVLDQ-LABEL: test_8f32tosb:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    vcvttps2dq %ymm0, %ymm0
-; NOVLDQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_8f32tosb:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttps2dq %ymm0, %ymm0
-; VLDQ-NEXT:    vpmovd2m %ymm0, %k1
-; VLDQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_8f32tosb:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vcvttps2dq %ymm0, %ymm0
-; VLNODQ-NEXT:    vptestmd %ymm0, %ymm0, %k1
-; VLNODQ-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_8f32tosb:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvttps2dq %ymm0, %ymm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    retq
-  %mask = fptosi <8 x float> %a to <8 x i1>
-  %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
-  ret <8 x i64> %select
-}
-
-define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) {
-; NODQ-LABEL: test_16f32tosb:
-; NODQ:       # %bb.0:
-; NODQ-NEXT:    vcvttps2dq %zmm0, %zmm0
-; NODQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
-; NODQ-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; NODQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_16f32tosb:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vcvttps2dq %zmm0, %zmm0
-; VLDQ-NEXT:    vpmovd2m %zmm0, %k1
-; VLDQ-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_16f32tosb:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    vcvttps2dq %zmm0, %zmm0
-; DQNOVL-NEXT:    vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    retq
-  %mask = fptosi <16 x float> %a to <16 x i1>
-  %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
-  ret <16 x i32> %select
-}
-
-define <2 x double> @test_sito2f64_mask_load(<2 x i32> *%a, <2 x i64> %c) {
-; SSE-LABEL: sitofp_load_2i32_to_2f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_2i32_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
-; AVX-NEXT:    retq
-; NOVLDQ-LABEL: test_sito2f64_mask_load:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; NOVLDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVLDQ-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT:    vcvtdq2pd (%rdi), %xmm0
-; NOVLDQ-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT:    vzeroupper
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_sito2f64_mask_load:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vpmovq2m %xmm0, %k1
-; VLDQ-NEXT:    vcvtdq2pd (%rdi), %xmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_sito2f64_mask_load:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VLNODQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %k1
-; VLNODQ-NEXT:    vcvtdq2pd (%rdi), %xmm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_sito2f64_mask_load:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; DQNOVL-NEXT:    vpmovq2m %zmm0, %k1
-; DQNOVL-NEXT:    vcvtdq2pd (%rdi), %xmm0
-; DQNOVL-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %mask = icmp slt <2 x i64> %c, zeroinitializer
-  %ld = load <2 x i32>, <2 x i32> *%a
-  %cvt = sitofp <2 x i32> %ld to <2 x double>
-  %sel = select <2 x i1> %mask, <2 x double> %cvt, <2 x double> zeroinitializer
-  ret <2 x double> %sel
-}
-
-define <2 x double> @test_uito2f64_mask_load(<2 x i32> *%a, <2 x i64> %c) {
-; SSE-LABEL: sitofp_load_2i32_to_2f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_2i32_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
-; AVX-NEXT:    retq
-; NOVLDQ-LABEL: test_uito2f64_mask_load:
-; NOVLDQ:       # %bb.0:
-; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; NOVLDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; NOVLDQ-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; NOVLDQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; NOVLDQ-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT:    vzeroupper
-; NOVLDQ-NEXT:    retq
-;
-; VLDQ-LABEL: test_uito2f64_mask_load:
-; VLDQ:       # %bb.0:
-; VLDQ-NEXT:    vpmovq2m %xmm0, %k1
-; VLDQ-NEXT:    vcvtudq2pd (%rdi), %xmm0 {%k1} {z}
-; VLDQ-NEXT:    retq
-;
-; VLNODQ-LABEL: test_uito2f64_mask_load:
-; VLNODQ:       # %bb.0:
-; VLNODQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VLNODQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %k1
-; VLNODQ-NEXT:    vcvtudq2pd (%rdi), %xmm0 {%k1} {z}
-; VLNODQ-NEXT:    retq
-;
-; DQNOVL-LABEL: test_uito2f64_mask_load:
-; DQNOVL:       # %bb.0:
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; DQNOVL-NEXT:    vpmovq2m %zmm0, %k1
-; DQNOVL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; DQNOVL-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; DQNOVL-NEXT:    vmovapd %zmm0, %zmm0 {%k1} {z}
-; DQNOVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT:    vzeroupper
-; DQNOVL-NEXT:    retq
-  %mask = icmp slt <2 x i64> %c, zeroinitializer
-  %ld = load <2 x i32>, <2 x i32> *%a
-  %cvt = uitofp <2 x i32> %ld to <2 x double>
-  %sel = select <2 x i1> %mask, <2 x double> %cvt, <2 x double> zeroinitializer
-  ret <2 x double> %sel
-}

Removed: llvm/trunk/test/CodeGen/X86/avx512-trunc-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-trunc-widen.ll?rev=368078&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-trunc-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-trunc-widen.ll (removed)
@@ -1,1035 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,SKX
-
- attributes #0 = { nounwind }
-
-define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) #0 {
-; ALL-LABEL: trunc_16x32_to_16x8:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovdb %zmm0, %xmm0
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x = trunc <16 x i32> %i to <16 x i8>
-  ret <16 x i8> %x
-}
-
-define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) #0 {
-; ALL-LABEL: trunc_8x64_to_8x16:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovqw %zmm0, %xmm0
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x = trunc <8 x i64> %i to <8 x i16>
-  ret <8 x i16> %x
-}
-
-define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) #0 {
-; ALL-LABEL: trunc_v16i32_to_v16i16:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovdw %zmm0, %ymm0
-; ALL-NEXT:    retq
-  %1 = trunc <16 x i32> %x to <16 x i16>
-  ret <16 x i16> %1
-}
-
-define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 {
-; ALL-LABEL: trunc_qb_512:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovqb %zmm0, %xmm0
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x = trunc <8 x i64> %i to <8 x i8>
-  ret <8 x i8> %x
-}
-
-define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 {
-; ALL-LABEL: trunc_qb_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovqb %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-    %x = trunc <8 x i64> %i to <8 x i8>
-    store <8 x i8> %x, <8 x i8>* %res
-    ret void
-}
-
-define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 {
-; KNL-LABEL: trunc_qb_256:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpmovqb %zmm0, %xmm0
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_qb_256:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovqb %ymm0, %xmm0
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x = trunc <4 x i64> %i to <4 x i8>
-  ret <4 x i8> %x
-}
-
-define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 {
-; KNL-LABEL: trunc_qb_256_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpmovqb %zmm0, %xmm0
-; KNL-NEXT:    vmovd %xmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_qb_256_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovqb %ymm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-    %x = trunc <4 x i64> %i to <4 x i8>
-    store <4 x i8> %x, <4 x i8>* %res
-    ret void
-}
-
-define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 {
-; ALL-LABEL: trunc_qb_128:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; ALL-NEXT:    retq
-  %x = trunc <2 x i64> %i to <2 x i8>
-  ret <2 x i8> %x
-}
-
-define void @trunc_qb_128_mem(<2 x i64> %i, <2 x i8>* %res) #0 {
-; KNL-LABEL: trunc_qb_128_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; KNL-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_qb_128_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovqb %xmm0, (%rdi)
-; SKX-NEXT:    retq
-    %x = trunc <2 x i64> %i to <2 x i8>
-    store <2 x i8> %x, <2 x i8>* %res
-    ret void
-}
-
-define <8 x i16> @trunc_qw_512(<8 x i64> %i) #0 {
-; ALL-LABEL: trunc_qw_512:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovqw %zmm0, %xmm0
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x = trunc <8 x i64> %i to <8 x i16>
-  ret <8 x i16> %x
-}
-
-define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 {
-; ALL-LABEL: trunc_qw_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovqw %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-    %x = trunc <8 x i64> %i to <8 x i16>
-    store <8 x i16> %x, <8 x i16>* %res
-    ret void
-}
-
-define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 {
-; KNL-LABEL: trunc_qw_256:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpmovqw %zmm0, %xmm0
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_qw_256:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovqw %ymm0, %xmm0
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x = trunc <4 x i64> %i to <4 x i16>
-  ret <4 x i16> %x
-}
-
-define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 {
-; KNL-LABEL: trunc_qw_256_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpmovqw %zmm0, %xmm0
-; KNL-NEXT:    vmovq %xmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_qw_256_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovqw %ymm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-    %x = trunc <4 x i64> %i to <4 x i16>
-    store <4 x i16> %x, <4 x i16>* %res
-    ret void
-}
-
-define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
-; KNL-LABEL: trunc_qw_128:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_qw_128:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; SKX-NEXT:    retq
-  %x = trunc <2 x i64> %i to <2 x i16>
-  ret <2 x i16> %x
-}
-
-define void @trunc_qw_128_mem(<2 x i64> %i, <2 x i16>* %res) #0 {
-; KNL-LABEL: trunc_qw_128_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; KNL-NEXT:    vmovd %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_qw_128_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovqw %xmm0, (%rdi)
-; SKX-NEXT:    retq
-    %x = trunc <2 x i64> %i to <2 x i16>
-    store <2 x i16> %x, <2 x i16>* %res
-    ret void
-}
-
-define <8 x i32> @trunc_qd_512(<8 x i64> %i) #0 {
-; ALL-LABEL: trunc_qd_512:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovqd %zmm0, %ymm0
-; ALL-NEXT:    retq
-  %x = trunc <8 x i64> %i to <8 x i32>
-  ret <8 x i32> %x
-}
-
-define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 {
-; ALL-LABEL: trunc_qd_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovqd %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-    %x = trunc <8 x i64> %i to <8 x i32>
-    store <8 x i32> %x, <8 x i32>* %res
-    ret void
-}
-
-define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 {
-; KNL-LABEL: trunc_qd_256:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpmovqd %zmm0, %ymm0
-; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $ymm0
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_qd_256:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovqd %ymm0, %xmm0
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x = trunc <4 x i64> %i to <4 x i32>
-  ret <4 x i32> %x
-}
-
-define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 {
-; KNL-LABEL: trunc_qd_256_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpmovqd %zmm0, %ymm0
-; KNL-NEXT:    vmovdqa %xmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_qd_256_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovqd %ymm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-    %x = trunc <4 x i64> %i to <4 x i32>
-    store <4 x i32> %x, <4 x i32>* %res
-    ret void
-}
-
-define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 {
-; ALL-LABEL: trunc_qd_128:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; ALL-NEXT:    retq
-  %x = trunc <2 x i64> %i to <2 x i32>
-  ret <2 x i32> %x
-}
-
-define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 {
-; KNL-LABEL: trunc_qd_128_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL-NEXT:    vmovlps %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_qd_128_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovqd %xmm0, (%rdi)
-; SKX-NEXT:    retq
-    %x = trunc <2 x i64> %i to <2 x i32>
-    store <2 x i32> %x, <2 x i32>* %res
-    ret void
-}
-
-define <16 x i8> @trunc_db_512(<16 x i32> %i) #0 {
-; ALL-LABEL: trunc_db_512:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovdb %zmm0, %xmm0
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x = trunc <16 x i32> %i to <16 x i8>
-  ret <16 x i8> %x
-}
-
-define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 {
-; ALL-LABEL: trunc_db_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovdb %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-    %x = trunc <16 x i32> %i to <16 x i8>
-    store <16 x i8> %x, <16 x i8>* %res
-    ret void
-}
-
-define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 {
-; KNL-LABEL: trunc_db_256:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_db_256:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovdb %ymm0, %xmm0
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x = trunc <8 x i32> %i to <8 x i8>
-  ret <8 x i8> %x
-}
-
-define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 {
-; KNL-LABEL: trunc_db_256_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vmovq %xmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_db_256_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovdb %ymm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-    %x = trunc <8 x i32> %i to <8 x i8>
-    store <8 x i8> %x, <8 x i8>* %res
-    ret void
-}
-
-define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 {
-; ALL-LABEL: trunc_db_128:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; ALL-NEXT:    retq
-  %x = trunc <4 x i32> %i to <4 x i8>
-  ret <4 x i8> %x
-}
-
-define void @trunc_db_128_mem(<4 x i32> %i, <4 x i8>* %res) #0 {
-; KNL-LABEL: trunc_db_128_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; KNL-NEXT:    vmovd %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_db_128_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovdb %xmm0, (%rdi)
-; SKX-NEXT:    retq
-    %x = trunc <4 x i32> %i to <4 x i8>
-    store <4 x i8> %x, <4 x i8>* %res
-    ret void
-}
-
-define <16 x i16> @trunc_dw_512(<16 x i32> %i) #0 {
-; ALL-LABEL: trunc_dw_512:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovdw %zmm0, %ymm0
-; ALL-NEXT:    retq
-  %x = trunc <16 x i32> %i to <16 x i16>
-  ret <16 x i16> %x
-}
-
-define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 {
-; ALL-LABEL: trunc_dw_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovdw %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-    %x = trunc <16 x i32> %i to <16 x i16>
-    store <16 x i16> %x, <16 x i16>* %res
-    ret void
-}
-
-define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 {
-; KNL-LABEL: trunc_dw_256:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpmovdw %zmm0, %ymm0
-; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 killed $ymm0
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_dw_256:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovdw %ymm0, %xmm0
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x = trunc <8 x i32> %i to <8 x i16>
-  ret <8 x i16> %x
-}
-
-define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 {
-; KNL-LABEL: trunc_dw_256_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT:    vpmovdw %zmm0, %ymm0
-; KNL-NEXT:    vmovdqa %xmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_dw_256_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovdw %ymm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-    %x = trunc <8 x i32> %i to <8 x i16>
-    store <8 x i16> %x, <8 x i16>* %res
-    ret void
-}
-
-define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 {
-; KNL-LABEL: trunc_dw_128_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; KNL-NEXT:    vmovq %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_dw_128_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovdw %xmm0, (%rdi)
-; SKX-NEXT:    retq
-    %x = trunc <4 x i32> %i to <4 x i16>
-    store <4 x i16> %x, <4 x i16>* %res
-    ret void
-}
-
-define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
-; KNL-LABEL: trunc_wb_512:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; KNL-NEXT:    vpmovdb %zmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_wb_512:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovwb %zmm0, %ymm0
-; SKX-NEXT:    retq
-  %x = trunc <32 x i16> %i to <32 x i8>
-  ret <32 x i8> %x
-}
-
-define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
-; KNL-LABEL: trunc_wb_512_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; KNL-NEXT:    vpmovdb %zmm1, 16(%rdi)
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_wb_512_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovwb %zmm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-    %x = trunc <32 x i16> %i to <32 x i8>
-    store <32 x i8> %x, <32 x i8>* %res
-    ret void
-}
-
-define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 {
-; KNL-LABEL: trunc_wb_256:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_wb_256:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovwb %ymm0, %xmm0
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x = trunc <16 x i16> %i to <16 x i8>
-  ret <16 x i8> %x
-}
-
-define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 {
-; KNL-LABEL: trunc_wb_256_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_wb_256_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovwb %ymm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-    %x = trunc <16 x i16> %i to <16 x i8>
-    store <16 x i8> %x, <16 x i8>* %res
-    ret void
-}
-
-define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 {
-; ALL-LABEL: trunc_wb_128:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; ALL-NEXT:    retq
-  %x = trunc <8 x i16> %i to <8 x i8>
-  ret <8 x i8> %x
-}
-
-define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 {
-; KNL-LABEL: trunc_wb_128_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; KNL-NEXT:    vmovq %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: trunc_wb_128_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovwb %xmm0, (%rdi)
-; SKX-NEXT:    retq
-    %x = trunc <8 x i16> %i to <8 x i8>
-    store <8 x i8> %x, <8 x i8>* %res
-    ret void
-}
-
-
-define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: usat_trunc_wb_256_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_wb_256_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovuswb %ymm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <16 x i16> %x5 to <16 x i8>
-  store <16 x i8> %x6, <16 x i8>* %res, align 1
-  ret void
-}
-
-define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
-; KNL-LABEL: usat_trunc_wb_256:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_wb_256:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <16 x i16> %x5 to <16 x i8>
-  ret <16 x i8> %x6
-}
-
-define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
-; KNL-LABEL: usat_trunc_wb_128_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpminuw {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
-; KNL-NEXT:    vmovq %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_wb_128_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmovuswb %xmm0, (%rdi)
-; SKX-NEXT:    retq
-  %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <8 x i16> %x5 to <8 x i8>
-  store <8 x i8> %x6, <8 x i8>* %res, align 1
-  ret void
-}
-
-define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) {
-; ALL-LABEL: usat_trunc_db_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovusdb %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x3 = icmp ult <16 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x6 = trunc <16 x i32> %x5 to <16 x i8>
-  store <16 x i8> %x6, <16 x i8>* %res, align 1
-  ret void
-}
-
-define void @usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) {
-; ALL-LABEL: usat_trunc_qb_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovusqb %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x3 = icmp ult <8 x i64> %i, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
-  %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
-  %x6 = trunc <8 x i64> %x5 to <8 x i8>
-  store <8 x i8> %x6, <8 x i8>* %res, align 1
-  ret void
-}
-
-define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) {
-; ALL-LABEL: usat_trunc_qd_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovusqd %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x3 = icmp ult <8 x i64> %i, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  %x6 = trunc <8 x i64> %x5 to <8 x i32>
-  store <8 x i32> %x6, <8 x i32>* %res, align 1
-  ret void
-}
-
-define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
-; ALL-LABEL: usat_trunc_qw_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovusqw %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x3 = icmp ult <8 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
-  %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
-  %x6 = trunc <8 x i64> %x5 to <8 x i16>
-  store <8 x i16> %x6, <8 x i16>* %res, align 1
-  ret void
-}
-
-define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) {
-; ALL-LABEL: usat_trunc_db_1024:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
-; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
-; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT:    retq
-  %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x6 = trunc <32 x i32> %x5 to <32 x i8>
-  ret <32 x i8> %x6
-}
-
-define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
-; ALL-LABEL: usat_trunc_db_1024_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
-; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
-; ALL-NEXT:    vmovdqu %xmm1, 16(%rdi)
-; ALL-NEXT:    vmovdqu %xmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x6 = trunc <32 x i32> %x5 to <32 x i8>
-  store <32 x i8>%x6, <32 x i8>* %p, align 1
-  ret void
-}
-
-define <16 x i16> @usat_trunc_dw_512(<16 x i32> %i) {
-; ALL-LABEL: usat_trunc_dw_512:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovusdw %zmm0, %ymm0
-; ALL-NEXT:    retq
-  %x3 = icmp ult <16 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-  %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-  %x6 = trunc <16 x i32> %x5 to <16 x i16>
-  ret <16 x i16> %x6
-}
-
-define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) {
-; ALL-LABEL: usat_trunc_wb_128:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpminuw {{.*}}(%rip), %xmm0, %xmm0
-; ALL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
-; ALL-NEXT:    retq
-  %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <8 x i16> %x5 to <8 x i8>
-  ret <8 x i8>%x6
-}
-
-define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) {
-; ALL-LABEL: usat_trunc_qw_1024:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpmovusqw %zmm0, %xmm0
-; ALL-NEXT:    vpmovusqw %zmm1, %xmm1
-; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT:    retq
-  %x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
-  %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
-  %x6 = trunc <16 x i64> %x5 to <16 x i16>
-  ret <16 x i16> %x6
-}
-
-define <16 x i8> @usat_trunc_db_256(<8 x i32> %x) {
-; KNL-LABEL: usat_trunc_db_256:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; KNL-NEXT:    vpminud %ymm1, %ymm0, %ymm0
-; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_db_256:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; SKX-NEXT:    vpmovdb %ymm0, %xmm0
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %tmp1 = icmp ult <8 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %tmp2 = select <8 x i1> %tmp1, <8 x i32> %x, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %tmp3 = trunc <8 x i32> %tmp2 to <8 x i8>
-  %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i8> %tmp4
-}
-
-
-
-; Tests for the following unsigned saturation pattern:
-
-; %a = icmp sgt %x, C1
-; %b = select %a, %x, C2
-; %c = icmp slt %b, C2
-; %d = select %c, %b, C2
-; %res = trunc %d
-
-
-define void @smax_usat_trunc_wb_256_mem1(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: smax_usat_trunc_wb_256_mem1:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: smax_usat_trunc_wb_256_mem1:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT:    vpmovuswb %ymm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
-  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
-  %x3 = icmp slt <16 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <16 x i16> %x5 to <16 x i8>
-  store <16 x i8> %x6, <16 x i8>* %res, align 1
-  ret void
-}
-
-; Test for smax(smin(x, C2), C1).
-define void @smax_usat_trunc_wb_256_mem2(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: smax_usat_trunc_wb_256_mem2:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: smax_usat_trunc_wb_256_mem2:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT:    vpmovuswb %ymm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x1 = icmp slt <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x3 = icmp sgt <16 x i16> %x2, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
-  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
-  %x6 = trunc <16 x i16> %x5 to <16 x i8>
-  store <16 x i8> %x6, <16 x i8>* %res, align 1
-  ret void
-}
-
-define <16 x i8> @smax_usat_trunc_wb_256(<16 x i16> %i) {
-; KNL-LABEL: smax_usat_trunc_wb_256:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: smax_usat_trunc_wb_256:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
-  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
-  %x3 = icmp slt <16 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <16 x i16> %x5 to <16 x i8>
-  ret <16 x i8> %x6
-  }
-
-define void @smax_usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
-; KNL-LABEL: smax_usat_trunc_wb_128_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; KNL-NEXT:    vpminsw {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
-; KNL-NEXT:    vmovq %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: smax_usat_trunc_wb_128_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; SKX-NEXT:    vpmovuswb %xmm0, (%rdi)
-; SKX-NEXT:    retq
-  %x1 = icmp sgt <8 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
-  %x2 = select <8 x i1> %x1, <8 x i16> %i, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
-  %x3 = icmp slt <8 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <8 x i1> %x3, <8 x i16> %x2, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <8 x i16> %x5 to <8 x i8>
-  store <8 x i8> %x6, <8 x i8>* %res, align 1
-  ret void
-}
-
-define void @smax_usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) {
-; ALL-LABEL: smax_usat_trunc_db_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
-; ALL-NEXT:    vpmovusdb %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %x3 = icmp slt <16 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x6 = trunc <16 x i32> %x5 to <16 x i8>
-  store <16 x i8> %x6, <16 x i8>* %res, align 1
-  ret void
-}
-
-define void @smax_usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) {
-; ALL-LABEL: smax_usat_trunc_qb_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT:    vpmovusqb %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
-  %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
-  %x3 = icmp slt <8 x i64> %x2, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
-  %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
-  %x6 = trunc <8 x i64> %x5 to <8 x i8>
-  store <8 x i8> %x6, <8 x i8>* %res, align 1
-  ret void
-}
-
-define void @smax_usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) {
-; ALL-LABEL: smax_usat_trunc_qd_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT:    vpmovusqd %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
-  %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
-  %x3 = icmp slt <8 x i64> %x2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  %x6 = trunc <8 x i64> %x5 to <8 x i32>
-  store <8 x i32> %x6, <8 x i32>* %res, align 1
-  ret void
-}
-
-define void @smax_usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
-; ALL-LABEL: smax_usat_trunc_qw_512_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT:    vpmovusqw %zmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
-  %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
-  %x3 = icmp slt <8 x i64> %x2, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
-  %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
-  %x6 = trunc <8 x i64> %x5 to <8 x i16>
-  store <8 x i16> %x6, <8 x i16>* %res, align 1
-  ret void
-}
-
-define <32 x i8> @smax_usat_trunc_db_1024(<32 x i32> %i) {
-; ALL-LABEL: smax_usat_trunc_db_1024:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
-; ALL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
-; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
-; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
-; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT:    retq
-  %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x6 = trunc <32 x i32> %x5 to <32 x i8>
-  ret <32 x i8> %x6
-}
-
-define void @smax_usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
-; ALL-LABEL: smax_usat_trunc_db_1024_mem:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
-; ALL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
-; ALL-NEXT:    vpmovusdb %zmm0, %xmm0
-; ALL-NEXT:    vpmovusdb %zmm1, %xmm1
-; ALL-NEXT:    vmovdqu %xmm1, 16(%rdi)
-; ALL-NEXT:    vmovdqu %xmm0, (%rdi)
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    retq
-  %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x6 = trunc <32 x i32> %x5 to <32 x i8>
-  store <32 x i8>%x6, <32 x i8>* %p, align 1
-  ret void
-}
-
-define <16 x i16> @smax_usat_trunc_dw_512(<16 x i32> %i) {
-; ALL-LABEL: smax_usat_trunc_dw_512:
-; ALL:       ## %bb.0:
-; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
-; ALL-NEXT:    vpmovusdw %zmm0, %ymm0
-; ALL-NEXT:    retq
-  %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
-  %x3 = icmp slt <16 x i32> %x2, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-  %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-  %x6 = trunc <16 x i32> %x5 to <16 x i16>
-  ret <16 x i16> %x6
-}
-
-define void @negative_test1_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: negative_test1_smax_usat_trunc_wb_256_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; KNL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: negative_test1_smax_usat_trunc_wb_256_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; SKX-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT:    vpmovwb %ymm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
-  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
-  %x3 = icmp slt <16 x i16> %x2, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
-  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
-  %x6 = trunc <16 x i16> %x5 to <16 x i8>
-  store <16 x i8> %x6, <16 x i8>* %res, align 1
-  ret void
-}
-
-define void @negative_test2_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: negative_test2_smax_usat_trunc_wb_256_mem:
-; KNL:       ## %bb.0:
-; KNL-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
-; KNL-NEXT:    vzeroupper
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: negative_test2_smax_usat_trunc_wb_256_mem:
-; SKX:       ## %bb.0:
-; SKX-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; SKX-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; SKX-NEXT:    vpmovwb %ymm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-  %x1 = icmp sgt <16 x i16> %i, <i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10>
-  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10>
-  %x3 = icmp slt <16 x i16> %x2, <i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5>
-  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5>
-  %x6 = trunc <16 x i16> %x5 to <16 x i8>
-  store <16 x i8> %x6, <16 x i8>* %res, align 1
-  ret void
-}

Modified: llvm/trunk/test/CodeGen/X86/bswap-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bswap-vector.ll?rev=368079&r1=368078&r2=368079&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bswap-vector.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bswap-vector.ll Tue Aug  6 13:12:20 2019
@@ -2,7 +2,6 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-NOSSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-SSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-AVX --check-prefix=CHECK-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-WIDE-AVX --check-prefix=CHECK-WIDE-AVX2
 
 declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
 declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
@@ -31,11 +30,6 @@ define <8 x i16> @test1(<8 x i16> %v) {
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: test1:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
   ret <8 x i16> %r
@@ -64,11 +58,6 @@ define <4 x i32> @test2(<4 x i32> %v) {
 ; CHECK-AVX:       # %bb.0:
 ; CHECK-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: test2:
-; CHECK-WIDE-AVX:       # %bb.0:
-; CHECK-WIDE-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-WIDE-AVX-NEXT:    retq
   %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
   ret <4 x i32> %r
 }
@@ -99,12 +88,6 @@ define <4 x i32> @or_bswap(<4 x i32> %x,
 ; CHECK-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; CHECK-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: or_bswap:
-; CHECK-WIDE-AVX:       # %bb.0:
-; CHECK-WIDE-AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-WIDE-AVX-NEXT:    retq
   %xt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %x)
   %yt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %y)
   %r = or <4 x i32> %xt, %yt
@@ -136,11 +119,6 @@ define <2 x i64> @test3(<2 x i64> %v) {
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: test3:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
   ret <2 x i64> %r
@@ -183,11 +161,6 @@ define <16 x i16> @test4(<16 x i16> %v)
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: test4:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v)
   ret <16 x i16> %r
@@ -226,11 +199,6 @@ define <8 x i32> @test5(<8 x i32> %v) {
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: test5:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v)
   ret <8 x i32> %r
@@ -273,11 +241,6 @@ define <4 x i64> @test6(<4 x i64> %v) {
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: test6:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v)
   ret <4 x i64> %r
@@ -308,11 +271,6 @@ define <4 x i16> @test7(<4 x i16> %v) {
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: test7:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v)
   ret <4 x i16> %r
@@ -406,11 +364,6 @@ define <8 x i16> @fold_v8i16() {
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v8i16:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> <i16 0, i16 1, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6>)
   ret <8 x i16> %r
@@ -426,11 +379,6 @@ define <4 x i32> @fold_v4i32() {
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v4i32:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> <i32 0, i32 -1, i32 2, i32 -3>)
   ret <4 x i32> %r
@@ -446,11 +394,6 @@ define <2 x i64> @fold_v2i64() {
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v2i64:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> <i64 255, i64 -1>)
   ret <2 x i64> %r
@@ -467,11 +410,6 @@ define <16 x i16> @fold_v16i16() {
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v16i16:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> <i16 0, i16 1, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14>)
   ret <16 x i16> %r
@@ -488,11 +426,6 @@ define <8 x i32> @fold_v8i32() {
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v8i32:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> <i32 0, i32 1, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6>)
   ret <8 x i32> %r
@@ -509,11 +442,6 @@ define <4 x i64> @fold_v4i64() {
 ; CHECK-AVX:       # %bb.0: # %entry
 ; CHECK-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160]
 ; CHECK-AVX-NEXT:    retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v4i64:
-; CHECK-WIDE-AVX:       # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160]
-; CHECK-WIDE-AVX-NEXT:    retq
 entry:
   %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> <i64 255, i64 -1, i64 65535, i64 16776960>)
   ret <4 x i64> %r

Modified: llvm/trunk/test/CodeGen/X86/lower-bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/lower-bitcast.ll?rev=368079&r1=368078&r2=368079&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/lower-bitcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/lower-bitcast.ll Tue Aug  6 13:12:20 2019
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=core2 -mattr=+sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=core2 -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
 
 ; FIXME: Ideally we should be able to fold the entire body of @test1 into a
 ; single paddd instruction. At the moment we produce the sequence
@@ -11,11 +10,6 @@ define double @test1(double %A) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
-;
-; CHECK-WIDE-LABEL: test1:
-; CHECK-WIDE:       # %bb.0:
-; CHECK-WIDE-NEXT:    paddd {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT:    retq
   %1 = bitcast double %A to <2 x i32>
   %add = add <2 x i32> %1, <i32 3, i32 5>
   %2 = bitcast <2 x i32> %add to double
@@ -27,11 +21,6 @@ define double @test2(double %A, double %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    paddd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
-;
-; CHECK-WIDE-LABEL: test2:
-; CHECK-WIDE:       # %bb.0:
-; CHECK-WIDE-NEXT:    paddd %xmm1, %xmm0
-; CHECK-WIDE-NEXT:    retq
   %1 = bitcast double %A to <2 x i32>
   %2 = bitcast double %B to <2 x i32>
   %add = add <2 x i32> %1, %2
@@ -46,13 +35,6 @@ define i64 @test3(i64 %A) {
 ; CHECK-NEXT:    addps {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    movq %xmm0, %rax
 ; CHECK-NEXT:    retq
-;
-; CHECK-WIDE-LABEL: test3:
-; CHECK-WIDE:       # %bb.0:
-; CHECK-WIDE-NEXT:    movq %rdi, %xmm0
-; CHECK-WIDE-NEXT:    addps {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT:    movq %xmm0, %rax
-; CHECK-WIDE-NEXT:    retq
   %1 = bitcast i64 %A to <2 x float>
   %add = fadd <2 x float> %1, <float 3.0, float 5.0>
   %2 = bitcast <2 x float> %add to i64
@@ -69,13 +51,6 @@ define i64 @test4(i64 %A) {
 ; CHECK-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    movq %xmm0, %rax
 ; CHECK-NEXT:    retq
-;
-; CHECK-WIDE-LABEL: test4:
-; CHECK-WIDE:       # %bb.0:
-; CHECK-WIDE-NEXT:    movq %rdi, %xmm0
-; CHECK-WIDE-NEXT:    paddd {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT:    movq %xmm0, %rax
-; CHECK-WIDE-NEXT:    retq
   %1 = bitcast i64 %A to <2 x i32>
   %add = add <2 x i32> %1, <i32 3, i32 5>
   %2 = bitcast <2 x i32> %add to i64
@@ -87,11 +62,6 @@ define double @test5(double %A) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addps {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
-;
-; CHECK-WIDE-LABEL: test5:
-; CHECK-WIDE:       # %bb.0:
-; CHECK-WIDE-NEXT:    addps {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT:    retq
   %1 = bitcast double %A to <2 x float>
   %add = fadd <2 x float> %1, <float 3.0, float 5.0>
   %2 = bitcast <2 x float> %add to double
@@ -106,11 +76,6 @@ define double @test6(double %A) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    paddw {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
-;
-; CHECK-WIDE-LABEL: test6:
-; CHECK-WIDE:       # %bb.0:
-; CHECK-WIDE-NEXT:    paddw {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT:    retq
   %1 = bitcast double %A to <4 x i16>
   %add = add <4 x i16> %1, <i16 3, i16 4, i16 5, i16 6>
   %2 = bitcast <4 x i16> %add to double
@@ -122,11 +87,6 @@ define double @test7(double %A, double %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    paddw %xmm1, %xmm0
 ; CHECK-NEXT:    retq
-;
-; CHECK-WIDE-LABEL: test7:
-; CHECK-WIDE:       # %bb.0:
-; CHECK-WIDE-NEXT:    paddw %xmm1, %xmm0
-; CHECK-WIDE-NEXT:    retq
   %1 = bitcast double %A to <4 x i16>
   %2 = bitcast double %B to <4 x i16>
   %add = add <4 x i16> %1, %2
@@ -143,11 +103,6 @@ define double @test8(double %A) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    paddb {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
-;
-; CHECK-WIDE-LABEL: test8:
-; CHECK-WIDE:       # %bb.0:
-; CHECK-WIDE-NEXT:    paddb {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT:    retq
   %1 = bitcast double %A to <8 x i8>
   %add = add <8 x i8> %1, <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10>
   %2 = bitcast <8 x i8> %add to double
@@ -159,11 +114,6 @@ define double @test9(double %A, double %
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    paddb %xmm1, %xmm0
 ; CHECK-NEXT:    retq
-;
-; CHECK-WIDE-LABEL: test9:
-; CHECK-WIDE:       # %bb.0:
-; CHECK-WIDE-NEXT:    paddb %xmm1, %xmm0
-; CHECK-WIDE-NEXT:    retq
   %1 = bitcast double %A to <8 x i8>
   %2 = bitcast double %B to <8 x i8>
   %add = add <8 x i8> %1, %2

Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter_widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter_widen.ll?rev=368079&r1=368078&r2=368079&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter_widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter_widen.ll Tue Aug  6 13:12:20 2019
@@ -1,10 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_SKX
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_KNL
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=CHECK --check-prefix=PROMOTE --check-prefix=PROMOTE_SKX
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=CHECK --check-prefix=PROMOTE --check-prefix=PROMOTE_KNL
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=WIDEN_AVX2
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s --check-prefix=PROMOTE_AVX2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_SKX
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_KNL
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s --check-prefix=WIDEN_AVX2
 
 define <2 x double> @test_gather_v2i32_index(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
 ; WIDEN_SKX-LABEL: test_gather_v2i32_index:
@@ -28,40 +25,12 @@ define <2 x double> @test_gather_v2i32_i
 ; WIDEN_KNL-NEXT:    vzeroupper
 ; WIDEN_KNL-NEXT:    retq
 ;
-; PROMOTE_SKX-LABEL: test_gather_v2i32_index:
-; PROMOTE_SKX:       # %bb.0:
-; PROMOTE_SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
-; PROMOTE_SKX-NEXT:    vpmovq2m %xmm1, %k1
-; PROMOTE_SKX-NEXT:    vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1}
-; PROMOTE_SKX-NEXT:    vmovapd %xmm2, %xmm0
-; PROMOTE_SKX-NEXT:    retq
-;
-; PROMOTE_KNL-LABEL: test_gather_v2i32_index:
-; PROMOTE_KNL:       # %bb.0:
-; PROMOTE_KNL-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
-; PROMOTE_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; PROMOTE_KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
-; PROMOTE_KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
-; PROMOTE_KNL-NEXT:    kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT:    kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
-; PROMOTE_KNL-NEXT:    vmovapd %xmm2, %xmm0
-; PROMOTE_KNL-NEXT:    vzeroupper
-; PROMOTE_KNL-NEXT:    retq
-;
 ; WIDEN_AVX2-LABEL: test_gather_v2i32_index:
 ; WIDEN_AVX2:       # %bb.0:
 ; WIDEN_AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; WIDEN_AVX2-NEXT:    vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2
 ; WIDEN_AVX2-NEXT:    vmovapd %xmm2, %xmm0
 ; WIDEN_AVX2-NEXT:    retq
-;
-; PROMOTE_AVX2-LABEL: test_gather_v2i32_index:
-; PROMOTE_AVX2:       # %bb.0:
-; PROMOTE_AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT:    vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2
-; PROMOTE_AVX2-NEXT:    vmovapd %xmm2, %xmm0
-; PROMOTE_AVX2-NEXT:    retq
   %gep.random = getelementptr double, double* %base, <2 x i32> %ind
   %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
   ret <2 x double> %res
@@ -87,25 +56,6 @@ define void @test_scatter_v2i32_index(<2
 ; WIDEN_KNL-NEXT:    vzeroupper
 ; WIDEN_KNL-NEXT:    retq
 ;
-; PROMOTE_SKX-LABEL: test_scatter_v2i32_index:
-; PROMOTE_SKX:       # %bb.0:
-; PROMOTE_SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
-; PROMOTE_SKX-NEXT:    vpmovq2m %xmm2, %k1
-; PROMOTE_SKX-NEXT:    vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1}
-; PROMOTE_SKX-NEXT:    retq
-;
-; PROMOTE_KNL-LABEL: test_scatter_v2i32_index:
-; PROMOTE_KNL:       # %bb.0:
-; PROMOTE_KNL-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
-; PROMOTE_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; PROMOTE_KNL-NEXT:    vpsllq $63, %xmm2, %xmm2
-; PROMOTE_KNL-NEXT:    vptestmq %zmm2, %zmm2, %k0
-; PROMOTE_KNL-NEXT:    kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT:    kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT:    vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1}
-; PROMOTE_KNL-NEXT:    vzeroupper
-; PROMOTE_KNL-NEXT:    retq
-;
 ; WIDEN_AVX2-LABEL: test_scatter_v2i32_index:
 ; WIDEN_AVX2:       # %bb.0:
 ; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm1, %xmm1
@@ -131,32 +81,6 @@ define void @test_scatter_v2i32_index(<2
 ; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
 ; WIDEN_AVX2-NEXT:    vmovhps %xmm0, (%rax)
 ; WIDEN_AVX2-NEXT:    retq
-;
-; PROMOTE_AVX2-LABEL: test_scatter_v2i32_index:
-; PROMOTE_AVX2:       # %bb.0:
-; PROMOTE_AVX2-NEXT:    vpmovsxdq %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT:    vpsllq $3, %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT:    vmovq %rdi, %xmm3
-; PROMOTE_AVX2-NEXT:    vpbroadcastq %xmm3, %xmm3
-; PROMOTE_AVX2-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; PROMOTE_AVX2-NEXT:    vpsllq $63, %xmm2, %xmm2
-; PROMOTE_AVX2-NEXT:    vmovmskpd %xmm2, %eax
-; PROMOTE_AVX2-NEXT:    testb $1, %al
-; PROMOTE_AVX2-NEXT:    jne .LBB1_1
-; PROMOTE_AVX2-NEXT:  # %bb.2: # %else
-; PROMOTE_AVX2-NEXT:    testb $2, %al
-; PROMOTE_AVX2-NEXT:    jne .LBB1_3
-; PROMOTE_AVX2-NEXT:  .LBB1_4: # %else2
-; PROMOTE_AVX2-NEXT:    retq
-; PROMOTE_AVX2-NEXT:  .LBB1_1: # %cond.store
-; PROMOTE_AVX2-NEXT:    vmovq %xmm1, %rcx
-; PROMOTE_AVX2-NEXT:    vmovlps %xmm0, (%rcx)
-; PROMOTE_AVX2-NEXT:    testb $2, %al
-; PROMOTE_AVX2-NEXT:    je .LBB1_4
-; PROMOTE_AVX2-NEXT:  .LBB1_3: # %cond.store1
-; PROMOTE_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; PROMOTE_AVX2-NEXT:    vmovhps %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT:    retq
   %gep = getelementptr double, double *%base, <2 x i32> %ind
   call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
   ret void
@@ -184,27 +108,6 @@ define <2 x i32> @test_gather_v2i32_data
 ; WIDEN_KNL-NEXT:    vzeroupper
 ; WIDEN_KNL-NEXT:    retq
 ;
-; PROMOTE_SKX-LABEL: test_gather_v2i32_data:
-; PROMOTE_SKX:       # %bb.0:
-; PROMOTE_SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
-; PROMOTE_SKX-NEXT:    vpmovq2m %xmm1, %k1
-; PROMOTE_SKX-NEXT:    vpgatherqd (,%xmm0), %xmm2 {%k1}
-; PROMOTE_SKX-NEXT:    vmovdqa %xmm2, %xmm0
-; PROMOTE_SKX-NEXT:    retq
-;
-; PROMOTE_KNL-LABEL: test_gather_v2i32_data:
-; PROMOTE_KNL:       # %bb.0:
-; PROMOTE_KNL-NEXT:    # kill: def $xmm2 killed $xmm2 def $ymm2
-; PROMOTE_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; PROMOTE_KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
-; PROMOTE_KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
-; PROMOTE_KNL-NEXT:    kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT:    kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT:    vpgatherqd (,%zmm0), %ymm2 {%k1}
-; PROMOTE_KNL-NEXT:    vmovdqa %xmm2, %xmm0
-; PROMOTE_KNL-NEXT:    vzeroupper
-; PROMOTE_KNL-NEXT:    retq
-;
 ; WIDEN_AVX2-LABEL: test_gather_v2i32_data:
 ; WIDEN_AVX2:       # %bb.0:
 ; WIDEN_AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
@@ -212,14 +115,6 @@ define <2 x i32> @test_gather_v2i32_data
 ; WIDEN_AVX2-NEXT:    vpgatherqd %xmm1, (,%xmm0), %xmm2
 ; WIDEN_AVX2-NEXT:    vmovdqa %xmm2, %xmm0
 ; WIDEN_AVX2-NEXT:    retq
-;
-; PROMOTE_AVX2-LABEL: test_gather_v2i32_data:
-; PROMOTE_AVX2:       # %bb.0:
-; PROMOTE_AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; PROMOTE_AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT:    vpgatherqd %xmm1, (,%xmm0), %xmm2
-; PROMOTE_AVX2-NEXT:    vmovdqa %xmm2, %xmm0
-; PROMOTE_AVX2-NEXT:    retq
   %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptr, i32 4, <2 x i1> %mask, <2 x i32> %src0)
   ret <2 x i32>%res
 }
@@ -244,25 +139,6 @@ define void @test_scatter_v2i32_data(<2
 ; WIDEN_KNL-NEXT:    vzeroupper
 ; WIDEN_KNL-NEXT:    retq
 ;
-; PROMOTE_SKX-LABEL: test_scatter_v2i32_data:
-; PROMOTE_SKX:       # %bb.0:
-; PROMOTE_SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
-; PROMOTE_SKX-NEXT:    vpmovq2m %xmm2, %k1
-; PROMOTE_SKX-NEXT:    vpscatterqd %xmm0, (,%xmm1) {%k1}
-; PROMOTE_SKX-NEXT:    retq
-;
-; PROMOTE_KNL-LABEL: test_scatter_v2i32_data:
-; PROMOTE_KNL:       # %bb.0:
-; PROMOTE_KNL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; PROMOTE_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; PROMOTE_KNL-NEXT:    vpsllq $63, %xmm2, %xmm2
-; PROMOTE_KNL-NEXT:    vptestmq %zmm2, %zmm2, %k0
-; PROMOTE_KNL-NEXT:    kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT:    kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
-; PROMOTE_KNL-NEXT:    vzeroupper
-; PROMOTE_KNL-NEXT:    retq
-;
 ; WIDEN_AVX2-LABEL: test_scatter_v2i32_data:
 ; WIDEN_AVX2:       # %bb.0:
 ; WIDEN_AVX2-NEXT:    vpsllq $63, %xmm2, %xmm2
@@ -283,27 +159,6 @@ define void @test_scatter_v2i32_data(<2
 ; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
 ; WIDEN_AVX2-NEXT:    vextractps $1, %xmm0, (%rax)
 ; WIDEN_AVX2-NEXT:    retq
-;
-; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data:
-; PROMOTE_AVX2:       # %bb.0:
-; PROMOTE_AVX2-NEXT:    vpsllq $63, %xmm2, %xmm2
-; PROMOTE_AVX2-NEXT:    vmovmskpd %xmm2, %eax
-; PROMOTE_AVX2-NEXT:    testb $1, %al
-; PROMOTE_AVX2-NEXT:    jne .LBB3_1
-; PROMOTE_AVX2-NEXT:  # %bb.2: # %else
-; PROMOTE_AVX2-NEXT:    testb $2, %al
-; PROMOTE_AVX2-NEXT:    jne .LBB3_3
-; PROMOTE_AVX2-NEXT:  .LBB3_4: # %else2
-; PROMOTE_AVX2-NEXT:    retq
-; PROMOTE_AVX2-NEXT:  .LBB3_1: # %cond.store
-; PROMOTE_AVX2-NEXT:    vmovq %xmm1, %rcx
-; PROMOTE_AVX2-NEXT:    vmovss %xmm0, (%rcx)
-; PROMOTE_AVX2-NEXT:    testb $2, %al
-; PROMOTE_AVX2-NEXT:    je .LBB3_4
-; PROMOTE_AVX2-NEXT:  .LBB3_3: # %cond.store1
-; PROMOTE_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; PROMOTE_AVX2-NEXT:    vextractps $1, %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT:    retq
   call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
   ret void
 }
@@ -330,27 +185,6 @@ define <2 x i32> @test_gather_v2i32_data
 ; WIDEN_KNL-NEXT:    vzeroupper
 ; WIDEN_KNL-NEXT:    retq
 ;
-; PROMOTE_SKX-LABEL: test_gather_v2i32_data_index:
-; PROMOTE_SKX:       # %bb.0:
-; PROMOTE_SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
-; PROMOTE_SKX-NEXT:    vpmovq2m %xmm1, %k1
-; PROMOTE_SKX-NEXT:    vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1}
-; PROMOTE_SKX-NEXT:    vmovdqa %xmm2, %xmm0
-; PROMOTE_SKX-NEXT:    retq
-;
-; PROMOTE_KNL-LABEL: test_gather_v2i32_data_index:
-; PROMOTE_KNL:       # %bb.0:
-; PROMOTE_KNL-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
-; PROMOTE_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; PROMOTE_KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
-; PROMOTE_KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
-; PROMOTE_KNL-NEXT:    kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT:    kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT:    vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
-; PROMOTE_KNL-NEXT:    vmovdqa %xmm2, %xmm0
-; PROMOTE_KNL-NEXT:    vzeroupper
-; PROMOTE_KNL-NEXT:    retq
-;
 ; WIDEN_AVX2-LABEL: test_gather_v2i32_data_index:
 ; WIDEN_AVX2:       # %bb.0:
 ; WIDEN_AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
@@ -358,14 +192,6 @@ define <2 x i32> @test_gather_v2i32_data
 ; WIDEN_AVX2-NEXT:    vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2
 ; WIDEN_AVX2-NEXT:    vmovdqa %xmm2, %xmm0
 ; WIDEN_AVX2-NEXT:    retq
-;
-; PROMOTE_AVX2-LABEL: test_gather_v2i32_data_index:
-; PROMOTE_AVX2:       # %bb.0:
-; PROMOTE_AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; PROMOTE_AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT:    vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2
-; PROMOTE_AVX2-NEXT:    vmovdqa %xmm2, %xmm0
-; PROMOTE_AVX2-NEXT:    retq
   %gep.random = getelementptr i32, i32* %base, <2 x i32> %ind
   %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
   ret <2 x i32> %res
@@ -391,25 +217,6 @@ define void @test_scatter_v2i32_data_ind
 ; WIDEN_KNL-NEXT:    vzeroupper
 ; WIDEN_KNL-NEXT:    retq
 ;
-; PROMOTE_SKX-LABEL: test_scatter_v2i32_data_index:
-; PROMOTE_SKX:       # %bb.0:
-; PROMOTE_SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
-; PROMOTE_SKX-NEXT:    vpmovq2m %xmm2, %k1
-; PROMOTE_SKX-NEXT:    vpscatterdd %xmm0, (%rdi,%xmm1,4) {%k1}
-; PROMOTE_SKX-NEXT:    retq
-;
-; PROMOTE_KNL-LABEL: test_scatter_v2i32_data_index:
-; PROMOTE_KNL:       # %bb.0:
-; PROMOTE_KNL-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; PROMOTE_KNL-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; PROMOTE_KNL-NEXT:    vpsllq $63, %xmm2, %xmm2
-; PROMOTE_KNL-NEXT:    vptestmq %zmm2, %zmm2, %k0
-; PROMOTE_KNL-NEXT:    kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT:    kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT:    vpscatterdd %zmm0, (%rdi,%zmm1,4) {%k1}
-; PROMOTE_KNL-NEXT:    vzeroupper
-; PROMOTE_KNL-NEXT:    retq
-;
 ; WIDEN_AVX2-LABEL: test_scatter_v2i32_data_index:
 ; WIDEN_AVX2:       # %bb.0:
 ; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm1, %xmm1
@@ -435,32 +242,6 @@ define void @test_scatter_v2i32_data_ind
 ; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
 ; WIDEN_AVX2-NEXT:    vextractps $1, %xmm0, (%rax)
 ; WIDEN_AVX2-NEXT:    retq
-;
-; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data_index:
-; PROMOTE_AVX2:       # %bb.0:
-; PROMOTE_AVX2-NEXT:    vpmovsxdq %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT:    vpsllq $2, %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT:    vmovq %rdi, %xmm3
-; PROMOTE_AVX2-NEXT:    vpbroadcastq %xmm3, %xmm3
-; PROMOTE_AVX2-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; PROMOTE_AVX2-NEXT:    vpsllq $63, %xmm2, %xmm2
-; PROMOTE_AVX2-NEXT:    vmovmskpd %xmm2, %eax
-; PROMOTE_AVX2-NEXT:    testb $1, %al
-; PROMOTE_AVX2-NEXT:    jne .LBB5_1
-; PROMOTE_AVX2-NEXT:  # %bb.2: # %else
-; PROMOTE_AVX2-NEXT:    testb $2, %al
-; PROMOTE_AVX2-NEXT:    jne .LBB5_3
-; PROMOTE_AVX2-NEXT:  .LBB5_4: # %else2
-; PROMOTE_AVX2-NEXT:    retq
-; PROMOTE_AVX2-NEXT:  .LBB5_1: # %cond.store
-; PROMOTE_AVX2-NEXT:    vmovq %xmm1, %rcx
-; PROMOTE_AVX2-NEXT:    vmovss %xmm0, (%rcx)
-; PROMOTE_AVX2-NEXT:    testb $2, %al
-; PROMOTE_AVX2-NEXT:    je .LBB5_4
-; PROMOTE_AVX2-NEXT:  .LBB5_3: # %cond.store1
-; PROMOTE_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; PROMOTE_AVX2-NEXT:    vextractps $1, %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT:    retq
   %gep = getelementptr i32, i32 *%base, <2 x i32> %ind
   call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %gep, i32 4, <2 x i1> %mask)
   ret void

Modified: llvm/trunk/test/CodeGen/X86/pmulh.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pmulh.ll?rev=368079&r1=368078&r2=368079&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pmulh.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pmulh.ll Tue Aug  6 13:12:20 2019
@@ -1,8 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-PROMOTE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-WIDEN
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-PROMOTE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-WIDEN
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

Removed: llvm/trunk/test/CodeGen/X86/shrink_vmul-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shrink_vmul-widen.ll?rev=368078&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shrink_vmul-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shrink_vmul-widen.ll (removed)
@@ -1,2553 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
-
- at c = external global i32*, align 8
-
-; %val1 = load <2 x i8>
-; %op1 = zext<2 x i32> %val1
-; %val2 = load <2 x i8>
-; %op2 = zext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi8:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
-; X86-SSE-NEXT:    movd %edx, %xmm0
-; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-SSE-NEXT:    movd %eax, %xmm1
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi8:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT:    vmovd %edx, %xmm0
-; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT:    vmovd %eax, %xmm1
-; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi8:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm1
-; X64-SSE-NEXT:    pxor %xmm2, %xmm2
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi8:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm0
-; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm1
-; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
-  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
-  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
-  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
-  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <4 x i8>
-; %op1 = zext<4 x i32> %val1
-; %val2 = load <4 x i8>
-; %op2 = zext<4 x i32> %val2
-; %rst = mul <4 x i32> %op1, %op2
-;
-define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_4xi8:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_4xi8:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_4xi8:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    pxor %xmm2, %xmm2
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_4xi8:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
-  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
-  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
-  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
-  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
-  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
-  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <8 x i8>
-; %op1 = zext<8 x i32> %val1
-; %val2 = load <8 x i8>
-; %op2 = zext<8 x i32> %val2
-; %rst = mul <8 x i32> %op1, %op2
-;
-define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_8xi8:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX1-LABEL: mul_8xi8:
-; X86-AVX1:       # %bb.0: # %entry
-; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    movl c, %esi
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
-; X86-AVX1-NEXT:    vmovdqu %xmm0, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, (%esi,%ecx,4)
-; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    retl
-;
-; X86-AVX2-LABEL: mul_8xi8:
-; X86-AVX2:       # %bb.0: # %entry
-; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    movl c, %esi
-; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
-; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    vzeroupper
-; X86-AVX2-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_8xi8:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE-NEXT:    pxor %xmm2, %xmm2
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
-; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: mul_8xi8:
-; X64-AVX1:       # %bb.0: # %entry
-; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
-; X64-AVX1-NEXT:    vmovdqu %xmm0, 16(%rax,%rdx,4)
-; X64-AVX1-NEXT:    vmovdqu %xmm1, (%rax,%rdx,4)
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: mul_8xi8:
-; X64-AVX2:       # %bb.0: # %entry
-; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
-; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
-  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
-  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
-  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
-  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
-  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
-  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <16 x i8>
-; %op1 = zext<16 x i32> %val1
-; %val2 = load <16 x i8>
-; %op2 = zext<16 x i32> %val2
-; %rst = mul <16 x i32> %op1, %op2
-;
-define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_16xi8:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X86-SSE-NEXT:    pmullw %xmm3, %xmm4
-; X86-SSE-NEXT:    movdqa %xmm4, %xmm3
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm0, 32(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm4, 16(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm3, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX1-LABEL: mul_16xi8:
-; X86-AVX1:       # %bb.0: # %entry
-; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    movl c, %esi
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm3, (%esi,%ecx,4)
-; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    retl
-;
-; X86-AVX2-LABEL: mul_16xi8:
-; X86-AVX2:       # %bb.0: # %entry
-; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    movl c, %esi
-; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
-; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
-; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    vzeroupper
-; X86-AVX2-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_16xi8:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
-; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm1
-; X64-SSE-NEXT:    pxor %xmm2, %xmm2
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm4
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X64-SSE-NEXT:    pmullw %xmm3, %xmm4
-; X64-SSE-NEXT:    movdqa %xmm4, %xmm3
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
-; X64-SSE-NEXT:    movdqu %xmm0, 32(%rax,%rdx,4)
-; X64-SSE-NEXT:    movdqu %xmm4, 16(%rax,%rdx,4)
-; X64-SSE-NEXT:    movdqu %xmm3, (%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: mul_16xi8:
-; X64-AVX1:       # %bb.0: # %entry
-; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
-; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
-; X64-AVX1-NEXT:    vmovdqu %xmm0, 48(%rax,%rdx,4)
-; X64-AVX1-NEXT:    vmovdqu %xmm1, 32(%rax,%rdx,4)
-; X64-AVX1-NEXT:    vmovdqu %xmm2, 16(%rax,%rdx,4)
-; X64-AVX1-NEXT:    vmovdqu %xmm3, (%rax,%rdx,4)
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: mul_16xi8:
-; X64-AVX2:       # %bb.0: # %entry
-; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
-; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
-; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
-; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
-  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
-  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
-  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
-  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
-  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
-  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <2 x i16>
-; %op1 = zext<2 x i32> %val1
-; %val2 = load <2 x i16>
-; %op2 = zext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi16:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi16:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi16:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
-; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi16:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
-  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
-  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
-  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
-  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <4 x i16>
-; %op1 = zext<4 x i32> %val1
-; %val2 = load <4 x i16>
-; %op2 = zext<4 x i32> %val2
-; %rst = mul <4 x i32> %op1, %op2
-;
-define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_4xi16:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_4xi16:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_4xi16:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
-; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_4xi16:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
-  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
-  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
-  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
-  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
-  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
-  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <8 x i16>
-; %op1 = zext<8 x i32> %val1
-; %val2 = load <8 x i16>
-; %op2 = zext<8 x i32> %val2
-; %rst = mul <8 x i32> %op1, %op2
-;
-define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_8xi16:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX1-LABEL: mul_8xi16:
-; X86-AVX1:       # %bb.0: # %entry
-; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    movl c, %esi
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
-; X86-AVX1-NEXT:    vmovdqu %xmm0, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, (%esi,%ecx,4)
-; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    retl
-;
-; X86-AVX2-LABEL: mul_8xi16:
-; X86-AVX2:       # %bb.0: # %entry
-; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    movl c, %esi
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
-; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    vzeroupper
-; X86-AVX2-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_8xi16:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
-; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm1
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
-; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
-; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: mul_8xi16:
-; X64-AVX1:       # %bb.0: # %entry
-; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
-; X64-AVX1-NEXT:    vmovdqu %xmm0, 16(%rax,%rdx,4)
-; X64-AVX1-NEXT:    vmovdqu %xmm1, (%rax,%rdx,4)
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: mul_8xi16:
-; X64-AVX2:       # %bb.0: # %entry
-; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
-; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
-  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
-  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
-  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
-  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
-  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
-  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <16 x i16>
-; %op1 = zext<16 x i32> %val1
-; %val2 = load <16 x i16>
-; %op2 = zext<16 x i32> %val2
-; %rst = mul <16 x i32> %op1, %op2
-;
-define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_16xi16:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT:    movdqu 16(%edx,%ecx), %xmm1
-; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm2
-; X86-SSE-NEXT:    movdqu 16(%eax,%ecx), %xmm3
-; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
-; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm1, %xmm3
-; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-SSE-NEXT:    movdqu %xmm3, 32(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm2, (%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm0, 16(%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX1-LABEL: mul_16xi16:
-; X86-AVX1:       # %bb.0: # %entry
-; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    movl c, %esi
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm3, (%esi,%ecx,4)
-; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    retl
-;
-; X86-AVX2-LABEL: mul_16xi16:
-; X86-AVX2:       # %bb.0: # %entry
-; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    movl c, %esi
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
-; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
-; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    vzeroupper
-; X86-AVX2-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_16xi16:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
-; X64-SSE-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
-; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm2
-; X64-SSE-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
-; X64-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm4
-; X64-SSE-NEXT:    pmullw %xmm0, %xmm2
-; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X64-SSE-NEXT:    pmulhuw %xmm1, %xmm4
-; X64-SSE-NEXT:    pmullw %xmm1, %xmm3
-; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
-; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X64-SSE-NEXT:    movdqu %xmm3, 32(%rax,%rdx,4)
-; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
-; X64-SSE-NEXT:    movdqu %xmm2, (%rax,%rdx,4)
-; X64-SSE-NEXT:    movdqu %xmm0, 16(%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: mul_16xi16:
-; X64-AVX1:       # %bb.0: # %entry
-; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
-; X64-AVX1-NEXT:    vmovdqu %xmm0, 48(%rax,%rdx,4)
-; X64-AVX1-NEXT:    vmovdqu %xmm1, 32(%rax,%rdx,4)
-; X64-AVX1-NEXT:    vmovdqu %xmm2, 16(%rax,%rdx,4)
-; X64-AVX1-NEXT:    vmovdqu %xmm3, (%rax,%rdx,4)
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: mul_16xi16:
-; X64-AVX2:       # %bb.0: # %entry
-; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
-; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
-; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
-; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
-  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
-  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
-  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
-  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
-  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
-  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <2 x i8>
-; %op1 = sext<2 x i32> %val1
-; %val2 = load <2 x i8>
-; %op2 = sext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi8_sext:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
-; X86-SSE-NEXT:    movd %edx, %xmm0
-; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-SSE-NEXT:    movd %eax, %xmm1
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT:    psraw $8, %xmm0
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT:    psraw $8, %xmm1
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT:    psrad $16, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi8_sext:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT:    vmovd %edx, %xmm0
-; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT:    vmovd %eax, %xmm1
-; X86-AVX-NEXT:    vpmovsxbd %xmm1, %xmm1
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi8_sext:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm1
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT:    psraw $8, %xmm0
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT:    psraw $8, %xmm1
-; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT:    psrad $16, %xmm0
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi8_sext:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm0
-; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm1
-; X64-AVX-NEXT:    vpmovsxbd %xmm1, %xmm1
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
-  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
-  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
-  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
-  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <2 x i8>
-; %op1 = sext<2 x i32> %val1
-; %val2 = load <2 x i8>
-; %op2 = zext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi8_sext_zext:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
-; X86-SSE-NEXT:    movd %edx, %xmm0
-; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-SSE-NEXT:    movd %eax, %xmm1
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT:    psraw $8, %xmm0
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE-NEXT:    pmulhw %xmm0, %xmm2
-; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi8_sext_zext:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT:    vmovd %edx, %xmm0
-; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT:    movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT:    vmovd %eax, %xmm1
-; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi8_sext_zext:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm1
-; X64-SSE-NEXT:    pxor %xmm2, %xmm2
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT:    psraw $8, %xmm0
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X64-SSE-NEXT:    pmulhw %xmm0, %xmm2
-; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi8_sext_zext:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm0
-; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT:    movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm1
-; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
-  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
-  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
-  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
-  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <2 x i16>
-; %op1 = sext<2 x i32> %val1
-; %val2 = load <2 x i16>
-; %op2 = sext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi16_sext:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X86-SSE-NEXT:    pmulhw %xmm0, %xmm2
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi16_sext:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovsxwd %xmm1, %xmm1
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi16_sext:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
-; X64-SSE-NEXT:    pmulhw %xmm0, %xmm2
-; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi16_sext:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovsxwd %xmm1, %xmm1
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
-  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
-  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
-  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
-  %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <2 x i16>
-; %op1 = sext<2 x i32> %val1
-; %val2 = load <2 x i16>
-; %op2 = zext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi16_sext_zext:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X86-SSE-NEXT:    psrad $16, %xmm0
-; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    pxor %xmm2, %xmm2
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm0
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi16_sext_zext:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    movl c, %esi
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi16_sext_zext:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X64-SSE-NEXT:    psrad $16, %xmm0
-; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    pxor %xmm2, %xmm2
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-SSE-NEXT:    pmuludq %xmm2, %xmm0
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi16_sext_zext:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
-  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
-  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
-  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
-  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val1 = load <16 x i16>
-; %op1 = sext<16 x i32> %val1
-; %val2 = load <16 x i16>
-; %op2 = sext<16 x i32> %val2
-; %rst = mul <16 x i32> %op1, %op2
-;
-define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_16xi16_sext:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT:    movl c, %esi
-; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT:    movdqu 16(%edx,%ecx), %xmm1
-; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm2
-; X86-SSE-NEXT:    movdqu 16(%eax,%ecx), %xmm3
-; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X86-SSE-NEXT:    pmulhw %xmm0, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
-; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X86-SSE-NEXT:    pmulhw %xmm1, %xmm4
-; X86-SSE-NEXT:    pmullw %xmm1, %xmm3
-; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-SSE-NEXT:    movdqu %xmm3, 32(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm2, (%esi,%ecx,4)
-; X86-SSE-NEXT:    movdqu %xmm0, 16(%esi,%ecx,4)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX1-LABEL: mul_16xi16_sext:
-; X86-AVX1:       # %bb.0: # %entry
-; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT:    movl c, %esi
-; X86-AVX1-NEXT:    vpmovsxwd 24(%edx,%ecx), %xmm0
-; X86-AVX1-NEXT:    vpmovsxwd 16(%edx,%ecx), %xmm1
-; X86-AVX1-NEXT:    vpmovsxwd 8(%edx,%ecx), %xmm2
-; X86-AVX1-NEXT:    vpmovsxwd (%edx,%ecx), %xmm3
-; X86-AVX1-NEXT:    vpmovsxwd 24(%eax,%ecx), %xmm4
-; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
-; X86-AVX1-NEXT:    vpmovsxwd 16(%eax,%ecx), %xmm4
-; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT:    vpmovsxwd 8(%eax,%ecx), %xmm4
-; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT:    vpmovsxwd (%eax,%ecx), %xmm4
-; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT:    vmovdqu %xmm0, 48(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm1, 32(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm2, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT:    vmovdqu %xmm3, (%esi,%ecx,4)
-; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    retl
-;
-; X86-AVX2-LABEL: mul_16xi16_sext:
-; X86-AVX2:       # %bb.0: # %entry
-; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT:    movl c, %esi
-; X86-AVX2-NEXT:    vpmovsxwd 16(%edx,%ecx), %ymm0
-; X86-AVX2-NEXT:    vpmovsxwd (%edx,%ecx), %ymm1
-; X86-AVX2-NEXT:    vpmovsxwd 16(%eax,%ecx), %ymm2
-; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT:    vpmovsxwd (%eax,%ecx), %ymm2
-; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
-; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
-; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
-; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    vzeroupper
-; X86-AVX2-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_16xi16_sext:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
-; X64-SSE-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
-; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm2
-; X64-SSE-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
-; X64-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X64-SSE-NEXT:    pmulhw %xmm0, %xmm4
-; X64-SSE-NEXT:    pmullw %xmm0, %xmm2
-; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
-; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X64-SSE-NEXT:    pmulhw %xmm1, %xmm4
-; X64-SSE-NEXT:    pmullw %xmm1, %xmm3
-; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
-; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X64-SSE-NEXT:    movdqu %xmm3, 32(%rax,%rdx,4)
-; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
-; X64-SSE-NEXT:    movdqu %xmm2, (%rax,%rdx,4)
-; X64-SSE-NEXT:    movdqu %xmm0, 16(%rax,%rdx,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: mul_16xi16_sext:
-; X64-AVX1:       # %bb.0: # %entry
-; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX1-NEXT:    vpmovsxwd 24(%rdi,%rdx), %xmm0
-; X64-AVX1-NEXT:    vpmovsxwd 16(%rdi,%rdx), %xmm1
-; X64-AVX1-NEXT:    vpmovsxwd 8(%rdi,%rdx), %xmm2
-; X64-AVX1-NEXT:    vpmovsxwd (%rdi,%rdx), %xmm3
-; X64-AVX1-NEXT:    vpmovsxwd 24(%rsi,%rdx), %xmm4
-; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
-; X64-AVX1-NEXT:    vpmovsxwd 16(%rsi,%rdx), %xmm4
-; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
-; X64-AVX1-NEXT:    vpmovsxwd 8(%rsi,%rdx), %xmm4
-; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
-; X64-AVX1-NEXT:    vpmovsxwd (%rsi,%rdx), %xmm4
-; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
-; X64-AVX1-NEXT:    vmovdqu %xmm0, 48(%rax,%rdx,4)
-; X64-AVX1-NEXT:    vmovdqu %xmm1, 32(%rax,%rdx,4)
-; X64-AVX1-NEXT:    vmovdqu %xmm2, 16(%rax,%rdx,4)
-; X64-AVX1-NEXT:    vmovdqu %xmm3, (%rax,%rdx,4)
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: mul_16xi16_sext:
-; X64-AVX2:       # %bb.0: # %entry
-; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX2-NEXT:    vpmovsxwd 16(%rdi,%rdx), %ymm0
-; X64-AVX2-NEXT:    vpmovsxwd (%rdi,%rdx), %ymm1
-; X64-AVX2-NEXT:    vpmovsxwd 16(%rsi,%rdx), %ymm2
-; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
-; X64-AVX2-NEXT:    vpmovsxwd (%rsi,%rdx), %ymm2
-; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
-; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
-; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
-  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
-  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
-  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
-  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
-  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
-  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
-  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
-  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = zext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst1:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
-; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT:    movd %ecx, %xmm0
-; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst1:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm0
-; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst1:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pxor %xmm1, %xmm1
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst1:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm0
-; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
-  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
-  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = sext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst2:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
-; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT:    movd %ecx, %xmm0
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT:    psraw $8, %xmm0
-; X86-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X86-SSE-NEXT:    psrad $16, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst2:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm0
-; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst2:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT:    psraw $8, %xmm0
-; X64-SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X64-SSE-NEXT:    psrad $16, %xmm0
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst2:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm0
-; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
-  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
-  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = zext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst3:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
-; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT:    movd %ecx, %xmm0
-; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst3:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm0
-; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst3:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pxor %xmm1, %xmm1
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst3:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm0
-; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
-  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
-  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = zext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst4:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
-; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT:    movd %ecx, %xmm0
-; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
-; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst4:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm0
-; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst4:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    pxor %xmm1, %xmm1
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
-; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst4:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm0
-; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
-  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
-  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = sext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst5:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
-; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT:    movd %ecx, %xmm0
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT:    psraw $8, %xmm0
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
-; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst5:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm0
-; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst5:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT:    psraw $8, %xmm0
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
-; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst5:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm0
-; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
-  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
-  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = sext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst6:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
-; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT:    movd %ecx, %xmm0
-; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT:    psraw $8, %xmm0
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
-; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst6:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT:    vmovd %ecx, %xmm0
-; X86-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst6:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT:    movd %ecx, %xmm0
-; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT:    psraw $8, %xmm0
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
-; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst6:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT:    vmovd %ecx, %xmm0
-; X64-AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
-  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
-  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val = load <2 x i16>
-; %op1 = zext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi16_varconst1:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
-; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm2
-; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi16_varconst1:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi16_varconst1:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X64-SSE-NEXT:    pmulhuw %xmm1, %xmm2
-; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi16_varconst1:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
-  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
-  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val = load <2 x i16>
-; %op1 = sext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi16_varconst2:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
-; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
-; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
-; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi16_varconst2:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi16_varconst2:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
-; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
-; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
-; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi16_varconst2:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
-  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
-  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val = load <2 x i16>
-; %op1 = zext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi16_varconst3:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
-; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65536,u,u>
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi16_varconst3:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi16_varconst3:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    pxor %xmm1, %xmm1
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65536,u,u>
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-SSE-NEXT:    pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi16_varconst3:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
-  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
-  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-; %val = load <2 x i16>
-; %op1 = sext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi16_varconst4:
-; X86-SSE:       # %bb.0: # %entry
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl c, %edx
-; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X86-SSE-NEXT:    psrad $16, %xmm0
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,32768,u,u>
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-SSE-NEXT:    pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX-LABEL: mul_2xi16_varconst4:
-; X86-AVX:       # %bb.0: # %entry
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl c, %edx
-; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT:    retl
-;
-; X64-SSE-LABEL: mul_2xi16_varconst4:
-; X64-SSE:       # %bb.0: # %entry
-; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X64-SSE-NEXT:    psrad $16, %xmm0
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,32768,u,u>
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-SSE-NEXT:    pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX-LABEL: mul_2xi16_varconst4:
-; X64-AVX:       # %bb.0: # %entry
-; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT:    retq
-entry:
-  %pre = load i32*, i32** @c
-  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
-  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
-  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
-  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
-  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
-  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
-  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
-  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
-  ret void
-}
-
-;
-; Illegal Types
-;
-
-define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
-; X86-SSE-LABEL: PR34947:
-; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %esi
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    movdqa (%eax), %xmm5
-; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    movdqa (%ecx), %xmm2
-; X86-SSE-NEXT:    movdqa 16(%ecx), %xmm6
-; X86-SSE-NEXT:    pxor %xmm0, %xmm0
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSE-NEXT:    movdqa %xmm5, %xmm4
-; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
-; X86-SSE-NEXT:    movd %xmm0, %eax
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
-; X86-SSE-NEXT:    movd %xmm0, %esi
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %esi
-; X86-SSE-NEXT:    movd %edx, %xmm0
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
-; X86-SSE-NEXT:    movd %xmm3, %eax
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,0,1]
-; X86-SSE-NEXT:    movd %xmm3, %esi
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %esi
-; X86-SSE-NEXT:    movd %edx, %xmm7
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; X86-SSE-NEXT:    movd %xmm5, %eax
-; X86-SSE-NEXT:    movd %xmm6, %esi
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %esi
-; X86-SSE-NEXT:    movd %edx, %xmm3
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
-; X86-SSE-NEXT:    movd %xmm5, %eax
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
-; X86-SSE-NEXT:    movd %xmm5, %esi
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %esi
-; X86-SSE-NEXT:    movd %edx, %xmm5
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
-; X86-SSE-NEXT:    movd %xmm6, %eax
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
-; X86-SSE-NEXT:    movd %xmm6, %esi
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %esi
-; X86-SSE-NEXT:    movd %edx, %xmm6
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
-; X86-SSE-NEXT:    movd %xmm7, %eax
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
-; X86-SSE-NEXT:    movd %xmm7, %esi
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %esi
-; X86-SSE-NEXT:    movd %edx, %xmm7
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; X86-SSE-NEXT:    movd %xmm4, %eax
-; X86-SSE-NEXT:    movd %xmm2, %esi
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %esi
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; X86-SSE-NEXT:    movd %xmm4, %eax
-; X86-SSE-NEXT:    movd %edx, %xmm4
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; X86-SSE-NEXT:    movd %xmm2, %esi
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl %esi
-; X86-SSE-NEXT:    movd %edx, %xmm2
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
-; X86-SSE-NEXT:    movd %xmm1, %eax
-; X86-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
-; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
-; X86-SSE-NEXT:    pmuludq %xmm1, %xmm4
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; X86-SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,0],xmm0[0,0]
-; X86-SSE-NEXT:    pmuludq %xmm1, %xmm3
-; X86-SSE-NEXT:    pmuludq %xmm1, %xmm5
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
-; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE-NEXT:    xorl %edx, %edx
-; X86-SSE-NEXT:    divl 32(%ecx)
-; X86-SSE-NEXT:    movdqa %xmm0, (%eax)
-; X86-SSE-NEXT:    movdqa %xmm4, (%eax)
-; X86-SSE-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
-; X86-SSE-NEXT:    movl %eax, (%eax)
-; X86-SSE-NEXT:    popl %esi
-; X86-SSE-NEXT:    retl
-;
-; X86-AVX1-LABEL: PR34947:
-; X86-AVX1:       # %bb.0:
-; X86-AVX1-NEXT:    pushl %ebp
-; X86-AVX1-NEXT:    pushl %ebx
-; X86-AVX1-NEXT:    pushl %edi
-; X86-AVX1-NEXT:    pushl %esi
-; X86-AVX1-NEXT:    subl $16, %esp
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT:    vmovd %xmm1, %eax
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl 32(%ecx)
-; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $3, %xmm2, %eax
-; X86-AVX1-NEXT:    vmovdqa (%ecx), %xmm1
-; X86-AVX1-NEXT:    vmovdqa 16(%ecx), %xmm3
-; X86-AVX1-NEXT:    vpextrd $3, %xmm3, %ecx
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $2, %xmm2, %eax
-; X86-AVX1-NEXT:    vpextrd $2, %xmm3, %ecx
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT:    vpextrd $1, %xmm2, %eax
-; X86-AVX1-NEXT:    vpextrd $1, %xmm3, %ecx
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-AVX1-NEXT:    vmovd %xmm2, %eax
-; X86-AVX1-NEXT:    vmovd %xmm3, %ecx
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, %ebp
-; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; X86-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    movl %edx, %ebx
-; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; X86-AVX1-NEXT:    vpextrd $2, %xmm1, %esi
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl %esi
-; X86-AVX1-NEXT:    movl %edx, %esi
-; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; X86-AVX1-NEXT:    vpextrd $1, %xmm1, %edi
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl %edi
-; X86-AVX1-NEXT:    movl %edx, %edi
-; X86-AVX1-NEXT:    vmovd %xmm0, %eax
-; X86-AVX1-NEXT:    vmovd %xmm1, %ecx
-; X86-AVX1-NEXT:    xorl %edx, %edx
-; X86-AVX1-NEXT:    divl %ecx
-; X86-AVX1-NEXT:    vmovd %edx, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpinsrd $3, %ebx, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vmovd %ebp, %xmm1
-; X86-AVX1-NEXT:    vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT:    imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-AVX1-NEXT:    # imm = 0x2007
-; X86-AVX1-NEXT:    movl %eax, (%eax)
-; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
-; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
-; X86-AVX1-NEXT:    vmovdqa %xmm1, (%eax)
-; X86-AVX1-NEXT:    vmovdqa %xmm0, (%eax)
-; X86-AVX1-NEXT:    addl $16, %esp
-; X86-AVX1-NEXT:    popl %esi
-; X86-AVX1-NEXT:    popl %edi
-; X86-AVX1-NEXT:    popl %ebx
-; X86-AVX1-NEXT:    popl %ebp
-; X86-AVX1-NEXT:    retl
-;
-; X86-AVX2-LABEL: PR34947:
-; X86-AVX2:       # %bb.0:
-; X86-AVX2-NEXT:    pushl %edi
-; X86-AVX2-NEXT:    pushl %esi
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT:    vmovdqa (%esi), %xmm2
-; X86-AVX2-NEXT:    vmovdqa 16(%esi), %xmm3
-; X86-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
-; X86-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; X86-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
-; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl %ecx
-; X86-AVX2-NEXT:    movl %edx, %ecx
-; X86-AVX2-NEXT:    vmovd %xmm3, %edi
-; X86-AVX2-NEXT:    vmovd %xmm4, %eax
-; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl %edi
-; X86-AVX2-NEXT:    vmovd %edx, %xmm5
-; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
-; X86-AVX2-NEXT:    vpextrd $2, %xmm3, %ecx
-; X86-AVX2-NEXT:    vpextrd $2, %xmm4, %eax
-; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl %ecx
-; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
-; X86-AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
-; X86-AVX2-NEXT:    vpextrd $3, %xmm4, %eax
-; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl %ecx
-; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
-; X86-AVX2-NEXT:    vpextrd $1, %xmm2, %ecx
-; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
-; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl %ecx
-; X86-AVX2-NEXT:    movl %edx, %ecx
-; X86-AVX2-NEXT:    vmovd %xmm2, %edi
-; X86-AVX2-NEXT:    vmovd %xmm1, %eax
-; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl %edi
-; X86-AVX2-NEXT:    vmovd %edx, %xmm4
-; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
-; X86-AVX2-NEXT:    vpextrd $2, %xmm2, %ecx
-; X86-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
-; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl %ecx
-; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
-; X86-AVX2-NEXT:    vpextrd $3, %xmm2, %ecx
-; X86-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
-; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl %ecx
-; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
-; X86-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; X86-AVX2-NEXT:    vmovd %xmm0, %eax
-; X86-AVX2-NEXT:    xorl %edx, %edx
-; X86-AVX2-NEXT:    divl 32(%esi)
-; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
-; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
-; X86-AVX2-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
-; X86-AVX2-NEXT:    movl %eax, (%eax)
-; X86-AVX2-NEXT:    vmovdqa %ymm0, (%eax)
-; X86-AVX2-NEXT:    popl %esi
-; X86-AVX2-NEXT:    popl %edi
-; X86-AVX2-NEXT:    vzeroupper
-; X86-AVX2-NEXT:    retl
-;
-; X64-SSE-LABEL: PR34947:
-; X64-SSE:       # %bb.0:
-; X64-SSE-NEXT:    movdqa (%rdi), %xmm5
-; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT:    movdqa (%rsi), %xmm2
-; X64-SSE-NEXT:    movdqa 16(%rsi), %xmm6
-; X64-SSE-NEXT:    pxor %xmm0, %xmm0
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-SSE-NEXT:    movdqa %xmm5, %xmm3
-; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
-; X64-SSE-NEXT:    movd %xmm0, %eax
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
-; X64-SSE-NEXT:    movd %xmm0, %ecx
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm8
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
-; X64-SSE-NEXT:    movd %xmm4, %eax
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[2,3,0,1]
-; X64-SSE-NEXT:    movd %xmm4, %ecx
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm7
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; X64-SSE-NEXT:    movd %xmm5, %eax
-; X64-SSE-NEXT:    movd %xmm6, %ecx
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm4
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
-; X64-SSE-NEXT:    movd %xmm5, %eax
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
-; X64-SSE-NEXT:    movd %xmm5, %ecx
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm5
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[3,1,2,3]
-; X64-SSE-NEXT:    movd %xmm6, %eax
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
-; X64-SSE-NEXT:    movd %xmm6, %ecx
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm6
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
-; X64-SSE-NEXT:    movd %xmm7, %eax
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
-; X64-SSE-NEXT:    movd %xmm7, %ecx
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm7
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; X64-SSE-NEXT:    movd %xmm3, %eax
-; X64-SSE-NEXT:    movd %xmm2, %ecx
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; X64-SSE-NEXT:    movd %xmm3, %eax
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; X64-SSE-NEXT:    movd %xmm2, %ecx
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl %ecx
-; X64-SSE-NEXT:    movd %edx, %xmm2
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    xorl %edx, %edx
-; X64-SSE-NEXT:    divl 32(%rsi)
-; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
-; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
-; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-SSE-NEXT:    pmuludq %xmm1, %xmm4
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; X64-SSE-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[0,0]
-; X64-SSE-NEXT:    pmuludq %xmm1, %xmm5
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
-; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-SSE-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
-; X64-SSE-NEXT:    movl %eax, (%rax)
-; X64-SSE-NEXT:    movdqa %xmm2, (%rax)
-; X64-SSE-NEXT:    movdqa %xmm0, (%rax)
-; X64-SSE-NEXT:    retq
-;
-; X64-AVX1-LABEL: PR34947:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    pushq %rbp
-; X64-AVX1-NEXT:    pushq %rbx
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT:    vmovd %xmm1, %eax
-; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl 32(%rsi)
-; X64-AVX1-NEXT:    movl %edx, %r8d
-; X64-AVX1-NEXT:    vpextrd $3, %xmm2, %eax
-; X64-AVX1-NEXT:    vmovdqa (%rsi), %xmm1
-; X64-AVX1-NEXT:    vmovdqa 16(%rsi), %xmm3
-; X64-AVX1-NEXT:    vpextrd $3, %xmm3, %ecx
-; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %r9d
-; X64-AVX1-NEXT:    vpextrd $2, %xmm2, %eax
-; X64-AVX1-NEXT:    vpextrd $2, %xmm3, %ecx
-; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %r10d
-; X64-AVX1-NEXT:    vpextrd $1, %xmm2, %eax
-; X64-AVX1-NEXT:    vpextrd $1, %xmm3, %ecx
-; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %r11d
-; X64-AVX1-NEXT:    vmovd %xmm2, %eax
-; X64-AVX1-NEXT:    vmovd %xmm3, %ecx
-; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %esi
-; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %eax
-; X64-AVX1-NEXT:    vpextrd $3, %xmm1, %ecx
-; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %edi
-; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %eax
-; X64-AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
-; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ecx
-; X64-AVX1-NEXT:    movl %edx, %ecx
-; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %eax
-; X64-AVX1-NEXT:    vpextrd $1, %xmm1, %ebx
-; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ebx
-; X64-AVX1-NEXT:    movl %edx, %ebx
-; X64-AVX1-NEXT:    vmovd %xmm0, %eax
-; X64-AVX1-NEXT:    vmovd %xmm1, %ebp
-; X64-AVX1-NEXT:    xorl %edx, %edx
-; X64-AVX1-NEXT:    divl %ebp
-; X64-AVX1-NEXT:    vmovd %edx, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
-; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vmovd %esi, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm2, %xmm2
-; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
-; X64-AVX1-NEXT:    imull $8199, %r8d, %eax # imm = 0x2007
-; X64-AVX1-NEXT:    movl %eax, (%rax)
-; X64-AVX1-NEXT:    vmovdqa %xmm1, (%rax)
-; X64-AVX1-NEXT:    vmovdqa %xmm0, (%rax)
-; X64-AVX1-NEXT:    popq %rbx
-; X64-AVX1-NEXT:    popq %rbp
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: PR34947:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT:    vmovdqa (%rsi), %xmm2
-; X64-AVX2-NEXT:    vmovdqa 16(%rsi), %xmm3
-; X64-AVX2-NEXT:    vpextrd $1, %xmm3, %ecx
-; X64-AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
-; X64-AVX2-NEXT:    vpextrd $1, %xmm4, %eax
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl %ecx
-; X64-AVX2-NEXT:    movl %edx, %ecx
-; X64-AVX2-NEXT:    vmovd %xmm3, %edi
-; X64-AVX2-NEXT:    vmovd %xmm4, %eax
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl %edi
-; X64-AVX2-NEXT:    vmovd %edx, %xmm5
-; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm5, %xmm5
-; X64-AVX2-NEXT:    vpextrd $2, %xmm3, %ecx
-; X64-AVX2-NEXT:    vpextrd $2, %xmm4, %eax
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl %ecx
-; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm5
-; X64-AVX2-NEXT:    vpextrd $3, %xmm3, %ecx
-; X64-AVX2-NEXT:    vpextrd $3, %xmm4, %eax
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl %ecx
-; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm3
-; X64-AVX2-NEXT:    vpextrd $1, %xmm2, %ecx
-; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %eax
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl %ecx
-; X64-AVX2-NEXT:    movl %edx, %ecx
-; X64-AVX2-NEXT:    vmovd %xmm2, %edi
-; X64-AVX2-NEXT:    vmovd %xmm1, %eax
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl %edi
-; X64-AVX2-NEXT:    vmovd %edx, %xmm4
-; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm4, %xmm4
-; X64-AVX2-NEXT:    vpextrd $2, %xmm2, %ecx
-; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %eax
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl %ecx
-; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
-; X64-AVX2-NEXT:    vpextrd $3, %xmm2, %ecx
-; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %eax
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl %ecx
-; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm1
-; X64-AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; X64-AVX2-NEXT:    vmovd %xmm0, %eax
-; X64-AVX2-NEXT:    xorl %edx, %edx
-; X64-AVX2-NEXT:    divl 32(%rsi)
-; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
-; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
-; X64-AVX2-NEXT:    imull $8199, %edx, %eax # imm = 0x2007
-; X64-AVX2-NEXT:    movl %eax, (%rax)
-; X64-AVX2-NEXT:    vmovdqa %ymm0, (%rax)
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-  %a0 = load <9 x i16>, <9 x i16>* %p0, align 64
-  %a1 = load <9 x i32>, <9 x i32>* %p1, align 64
-  %ext0 = zext <9 x i16> %a0 to <9 x i32>
-  %rem = urem <9 x i32> %ext0, %a1
-  %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
-  store <9 x i32> %mul, <9 x i32>* undef, align 64
-  ret void
-}

Removed: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll?rev=368078&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll (removed)
@@ -1,574 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-
-; PR31551
-; Pairs of shufflevector:trunc functions with functional equivalence.
-; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
-
-define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
-; SSE2-LABEL: shuffle_v16i8_to_v8i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    movq %xmm0, (%rsi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: shuffle_v16i8_to_v8i8:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa (%rdi), %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE42-NEXT:    movq %xmm0, (%rsi)
-; SSE42-NEXT:    retq
-;
-; AVX-LABEL: shuffle_v16i8_to_v8i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512-NEXT:    retq
-  %vec = load <16 x i8>, <16 x i8>* %L
-  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-  store <8 x i8> %strided.vec, <8 x i8>* %S
-  ret void
-}
-
-define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
-; SSE2-LABEL: trunc_v8i16_to_v8i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    movq %xmm0, (%rsi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: trunc_v8i16_to_v8i8:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa (%rdi), %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE42-NEXT:    movq %xmm0, (%rsi)
-; SSE42-NEXT:    retq
-;
-; AVX-LABEL: trunc_v8i16_to_v8i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v8i16_to_v8i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v8i16_to_v8i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v8i16_to_v8i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-  %vec = load <16 x i8>, <16 x i8>* %L
-  %bc = bitcast <16 x i8> %vec to <8 x i16>
-  %strided.vec = trunc <8 x i16> %bc to <8 x i8>
-  store <8 x i8> %strided.vec, <8 x i8>* %S
-  ret void
-}
-
-define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
-; SSE2-LABEL: shuffle_v8i16_to_v4i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    movq %xmm0, (%rsi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: shuffle_v8i16_to_v4i16:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa (%rdi), %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT:    movq %xmm0, (%rsi)
-; SSE42-NEXT:    retq
-;
-; AVX-LABEL: shuffle_v8i16_to_v4i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: shuffle_v8i16_to_v4i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512-NEXT:    retq
-  %vec = load <8 x i16>, <8 x i16>* %L
-  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-  store <4 x i16> %strided.vec, <4 x i16>* %S
-  ret void
-}
-
-define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
-; SSE2-LABEL: trunc_v4i32_to_v4i16:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT:    movq %xmm0, (%rsi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: trunc_v4i32_to_v4i16:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa (%rdi), %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT:    movq %xmm0, (%rsi)
-; SSE42-NEXT:    retq
-;
-; AVX-LABEL: trunc_v4i32_to_v4i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v4i32_to_v4i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v4i32_to_v4i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v4i32_to_v4i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-  %vec = load <8 x i16>, <8 x i16>* %L
-  %bc = bitcast <8 x i16> %vec to <4 x i32>
-  %strided.vec = trunc <4 x i32> %bc to <4 x i16>
-  store <4 x i16> %strided.vec, <4 x i16>* %S
-  ret void
-}
-
-define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
-; SSE-LABEL: shuffle_v4i32_to_v2i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; SSE-NEXT:    movq %xmm0, (%rsi)
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: shuffle_v4i32_to_v2i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT:    vmovlps %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: shuffle_v4i32_to_v2i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512-NEXT:    vmovlps %xmm0, (%rsi)
-; AVX512-NEXT:    retq
-  %vec = load <4 x i32>, <4 x i32>* %L
-  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
-  store <2 x i32> %strided.vec, <2 x i32>* %S
-  ret void
-}
-
-define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
-; SSE-LABEL: trunc_v2i64_to_v2i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; SSE-NEXT:    movq %xmm0, (%rsi)
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: trunc_v2i64_to_v2i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT:    vmovlps %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v2i64_to_v2i32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT:    vmovlps %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v2i64_to_v2i32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpmovqd %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v2i64_to_v2i32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512BW-NEXT:    vmovlps %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpmovqd %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-  %vec = load <4 x i32>, <4 x i32>* %L
-  %bc = bitcast <4 x i32> %vec to <2 x i64>
-  %strided.vec = trunc <2 x i64> %bc to <2 x i32>
-  store <2 x i32> %strided.vec, <2 x i32>* %S
-  ret void
-}
-
-define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
-; SSE2-LABEL: shuffle_v16i8_to_v4i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    movd %xmm0, (%rsi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: shuffle_v16i8_to_v4i8:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa (%rdi), %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE42-NEXT:    movd %xmm0, (%rsi)
-; SSE42-NEXT:    retq
-;
-; AVX-LABEL: shuffle_v16i8_to_v4i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512-NEXT:    retq
-  %vec = load <16 x i8>, <16 x i8>* %L
-  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-  store <4 x i8> %strided.vec, <4 x i8>* %S
-  ret void
-}
-
-define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
-; SSE2-LABEL: trunc_v4i32_to_v4i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    movd %xmm0, (%rsi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: trunc_v4i32_to_v4i8:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa (%rdi), %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE42-NEXT:    movd %xmm0, (%rsi)
-; SSE42-NEXT:    retq
-;
-; AVX-LABEL: trunc_v4i32_to_v4i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v4i32_to_v4i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v4i32_to_v4i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v4i32_to_v4i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-  %vec = load <16 x i8>, <16 x i8>* %L
-  %bc = bitcast <16 x i8> %vec to <4 x i32>
-  %strided.vec = trunc <4 x i32> %bc to <4 x i8>
-  store <4 x i8> %strided.vec, <4 x i8>* %S
-  ret void
-}
-
-define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
-; SSE-LABEL: shuffle_v8i16_to_v2i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT:    movd %xmm0, (%rsi)
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: shuffle_v8i16_to_v2i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-  %vec = load <8 x i16>, <8 x i16>* %L
-  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
-  store <2 x i16> %strided.vec, <2 x i16>* %S
-  ret void
-}
-
-define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
-; SSE-LABEL: trunc_v2i64_to_v2i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT:    movd %xmm0, (%rsi)
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: trunc_v2i64_to_v2i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vmovd %xmm0, (%rsi)
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v2i64_to_v2i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v2i64_to_v2i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpmovqw %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpmovqw %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-  %vec = load <8 x i16>, <8 x i16>* %L
-  %bc = bitcast <8 x i16> %vec to <2 x i64>
-  %strided.vec = trunc <2 x i64> %bc to <2 x i16>
-  store <2 x i16> %strided.vec, <2 x i16>* %S
-  ret void
-}
-
-define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
-; SSE2-LABEL: shuffle_v16i8_to_v2i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movw %ax, (%rsi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: shuffle_v16i8_to_v2i8:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa (%rdi), %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
-; SSE42-NEXT:    retq
-;
-; AVX-LABEL: shuffle_v16i8_to_v2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT:    retq
-  %vec = load <16 x i8>, <16 x i8>* %L
-  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8>
-  store <2 x i8> %strided.vec, <2 x i8>* %S
-  ret void
-}
-
-define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
-; SSE2-LABEL: trunc_v2i64_to_v2i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movw %ax, (%rsi)
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: trunc_v2i64_to_v2i8:
-; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa (%rdi), %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
-; SSE42-NEXT:    retq
-;
-; AVX-LABEL: trunc_v2i64_to_v2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v2i64_to_v2i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vpextrw $0, %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v2i64_to_v2i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpmovqb %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v2i64_to_v2i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpextrw $0, %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vpmovqb %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-  %vec = load <16 x i8>, <16 x i8>* %L
-  %bc = bitcast <16 x i8> %vec to <2 x i64>
-  %strided.vec = trunc <2 x i64> %bc to <2 x i8>
-  store <2 x i8> %strided.vec, <2 x i8>* %S
-  ret void
-}

Removed: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll?rev=368078&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll (removed)
@@ -1,1454 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
-
-; PR31551
-; Pairs of shufflevector:trunc functions with functional equivalence.
-; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
-
-define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VBMIVL-NEXT:    vpermi2b 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <32 x i8>, <32 x i8>* %L
-  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-  store <16 x i8> %strided.vec, <16 x i8>* %S
-  ret void
-}
-
-define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX1-LABEL: trunc_v16i16_to_v16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: trunc_v16i16_to_v16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v16i16_to_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpmovwb %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <32 x i8>, <32 x i8>* %L
-  %bc = bitcast <32 x i8> %vec to <16 x i16>
-  %strided.vec = trunc <16 x i16> %bc to <16 x i8>
-  store <16 x i8> %strided.vec, <16 x i8>* %S
-  ret void
-}
-
-define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX-LABEL: shuffle_v16i16_to_v8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
-; AVX512VBMIVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <16 x i16>, <16 x i16>* %L
-  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-  store <8 x i16> %strided.vec, <8 x i16>* %S
-  ret void
-}
-
-define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX1-LABEL: trunc_v8i32_to_v8i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpmovdw %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <16 x i16>, <16 x i16>* %L
-  %bc = bitcast <16 x i16> %vec to <8 x i32>
-  %strided.vec = trunc <8 x i32> %bc to <8 x i16>
-  store <8 x i16> %strided.vec, <8 x i16>* %S
-  ret void
-}
-
-define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
-; AVX-LABEL: shuffle_v8i32_to_v4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %xmm0
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: shuffle_v8i32_to_v4i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX512-NEXT:    retq
-  %vec = load <8 x i32>, <8 x i32>* %L
-  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-  store <4 x i32> %strided.vec, <4 x i32>* %S
-  ret void
-}
-
-define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps (%rdi), %xmm0
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX1-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX2-SLOW-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT:    vpermps (%rdi), %ymm0, %ymm0
-; AVX2-FAST-NEXT:    vmovaps %xmm0, (%rsi)
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vpmovqd %ymm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vpmovqd %ymm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpmovqd %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <8 x i32>, <8 x i32>* %L
-  %bc = bitcast <8 x i32> %vec to <4 x i64>
-  %strided.vec = trunc <4 x i64> %bc to <4 x i32>
-  store <4 x i32> %strided.vec, <4 x i32>* %S
-  ret void
-}
-
-define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v8i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [2024390091656922112,2024390091656922112]
-; AVX512VBMIVL-NEXT:    vpermi2b 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT:    vmovq %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <32 x i8>, <32 x i8>* %L
-  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-  store <8 x i8> %strided.vec, <8 x i8>* %S
-  ret void
-}
-
-define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX-LABEL: trunc_v8i32_to_v8i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT:    vmovq %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <32 x i8>, <32 x i8>* %L
-  %bc = bitcast <32 x i8> %vec to <8 x i32>
-  %strided.vec = trunc <8 x i32> %bc to <8 x i8>
-  store <8 x i8> %strided.vec, <8 x i8>* %S
-  ret void
-}
-
-define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
-; IR generated from:
-; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0};
-; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
-  %bc = bitcast <8 x i8> %truncated.vec to i64
-  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
-  ret <2 x i64> %result
-}
-
-define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind {
-; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %truncated = trunc <8 x i32> %vec to <8 x i8>
-  %truncated.ext = zext <8 x i8> %truncated to <8 x i16>
-  %bc = bitcast <8 x i16> %truncated.ext to <16 x i8>
-  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-  ret <16 x i8> %result
-}
-
-define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind {
-; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %truncated = trunc <8 x i32> %vec to <8 x i16>
-  %bc = bitcast <8 x i16> %truncated to <16 x i8>
-  %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29>
-  ret <16 x i8> %result
-}
-
-define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
-; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vpmovdb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %truncated = trunc <8 x i32> %vec to <8 x i8>
-  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i8> %result
-}
-
-define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
-; IR generated from:
-; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0};
-; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %truncated = trunc <4 x i64> %vec to <4 x i16>
-  %bc = bitcast <4 x i16> %truncated to i64
-  %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
-  ret <2 x i64> %result
-}
-
-define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %truncated = trunc <4 x i64> %vec to <4 x i16>
-  %truncated.ext = zext <4 x i16> %truncated to <4 x i32>
-  %bc = bitcast <4 x i32> %truncated.ext to <8 x i16>
-  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-  ret <8 x i16> %result
-}
-
-define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %truncated = trunc <4 x i64> %vec to <4 x i32>
-  %bc = bitcast <4 x i32> %truncated to <8 x i16>
-  %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13>
-  ret <8 x i16> %result
-}
-
-define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-SLOW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-SLOW-NEXT:    vzeroupper
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-FAST-NEXT:    vzeroupper
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %truncated = trunc <4 x i64> %vec to <4 x i16>
-  %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i16> %result
-}
-
-define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovqb %ymm0, %xmm0
-; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovqb %ymm0, %xmm0
-; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VBMIVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %truncated = trunc <4 x i64> %vec to <4 x i8>
-  %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7>
-  ret <16 x i8> %result
-}
-
-define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
-; AVX1-LABEL: shuffle_v16i16_to_v4i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
-; AVX512BWVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT:    vmovq %xmm1, (%rsi)
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
-; AVX512VBMIVL-NEXT:    vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT:    vmovq %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <16 x i16>, <16 x i16>* %L
-  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-  store <4 x i16> %strided.vec, <4 x i16>* %S
-  ret void
-}
-
-define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-NEXT:    vmovq %xmm0, (%rsi)
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
-; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpmovqw %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <16 x i16>, <16 x i16>* %L
-  %bc = bitcast <16 x i16> %vec to <4 x i64>
-  %strided.vec = trunc <4 x i64> %bc to <4 x i16>
-  store <4 x i16> %strided.vec, <4 x i16>* %S
-  ret void
-}
-
-define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [403703808,403703808,403703808,403703808]
-; AVX512VBMIVL-NEXT:    vpermi2b 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT:    vmovd %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <32 x i8>, <32 x i8>* %L
-  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
-  store <4 x i8> %strided.vec, <4 x i8>* %S
-  ret void
-}
-
-define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX-LABEL: trunc_v4i64_to_v4i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vmovd %xmm0, (%rsi)
-; AVX-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpmovqb %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <32 x i8>, <32 x i8>* %L
-  %bc = bitcast <32 x i8> %vec to <4 x i64>
-  %strided.vec = trunc <4 x i64> %bc to <4 x i8>
-  store <4 x i8> %strided.vec, <4 x i8>* %S
-  ret void
-}
-
-; In this case not all elements are collected from the same source vector, so
-; the resulting BUILD_VECTOR should not be combined to a truncate.
-define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
-; AVX1-LABEL: negative:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
-; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX1-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: negative:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: negative:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: negative:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: negative:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: negative:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX512BWVL-NEXT:    movl $65537, %eax # imm = 0x10001
-; AVX512BWVL-NEXT:    kmovd %eax, %k1
-; AVX512BWVL-NEXT:    vmovdqu8 %ymm1, %ymm0 {%k1}
-; AVX512BWVL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: negative:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,48,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX512VBMIVL-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
-; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-  %w0 = extractelement <32 x i8> %w, i32 0
-  %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
-  ret <16 x i8> %merged
-}

Removed: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll?rev=368078&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll (removed)
@@ -1,903 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
-
-; PR31551
-; Pairs of shufflevector:trunc functions with functional equivalence.
-; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
-
-define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
-; AVX512VL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
-; AVX512BWVL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VBMI-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VBMI-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512VBMI-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VBMI-NEXT:    vmovdqa %ymm0, (%rsi)
-; AVX512VBMI-NEXT:    vzeroupper
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
-; AVX512VBMIVL-NEXT:    vpermi2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa %ymm1, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <64 x i8>, <64 x i8>* %L
-  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
-  store <32 x i8> %strided.vec, <32 x i8>* %S
-  ret void
-}
-
-define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
-; AVX512F-LABEL: trunc_v32i16_to_v32i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT:    vpmovdb %zmm1, 16(%rsi)
-; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512VL-NEXT:    vpmovdb %zmm1, 16(%rsi)
-; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
-; AVX512VBMI-NEXT:    vzeroupper
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512VBMIVL-NEXT:    vpmovwb %zmm0, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <64 x i8>, <64 x i8>* %L
-  %bc = bitcast <64 x i8> %vec to <32 x i16>
-  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
-  store <32 x i8> %strided.vec, <32 x i8>* %S
-  ret void
-}
-
-define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512F-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512F-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
-; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT:    vmovdqa %ymm2, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
-; AVX512BW-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa %ymm1, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
-; AVX512VBMI-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
-; AVX512VBMI-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
-; AVX512VBMI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512VBMI-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsi)
-; AVX512VBMI-NEXT:    vzeroupper
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VBMIVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa %ymm1, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <32 x i16>, <32 x i16>* %L
-  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-  store <16 x i16> %strided.vec, <16 x i16>* %S
-  ret void
-}
-
-define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
-; AVX512-LABEL: trunc_v16i32_to_v16i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %vec = load <32 x i16>, <32 x i16>* %L
-  %bc = bitcast <32 x i16> %vec to <16 x i32>
-  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
-  store <16 x i16> %strided.vec, <16 x i16>* %S
-  ret void
-}
-
-define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
-; AVX512F-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT:    vmovaps %ymm0, (%rsi)
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
-; AVX512VL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
-; AVX512VL-NEXT:    vmovdqa %ymm1, (%rsi)
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
-; AVX512BW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT:    vmovaps %ymm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa %ymm1, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512VBMI-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
-; AVX512VBMI-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VBMI-NEXT:    vmovaps %ymm0, (%rsi)
-; AVX512VBMI-NEXT:    vzeroupper
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
-; AVX512VBMIVL-NEXT:    vpermi2d 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa %ymm1, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <16 x i32>, <16 x i32>* %L
-  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-  store <8 x i32> %strided.vec, <8 x i32>* %S
-  ret void
-}
-
-define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
-; AVX512-LABEL: trunc_v8i64_to_v8i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %vec = load <16 x i32>, <16 x i32>* %L
-  %bc = bitcast <16 x i32> %vec to <8 x i64>
-  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
-  store <8 x i32> %strided.vec, <8 x i32>* %S
-  ret void
-}
-
-define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VBMI-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512VBMI-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512VBMIVL-NEXT:    vpermt2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <64 x i8>, <64 x i8>* %L
-  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
-  store <16 x i8> %strided.vec, <16 x i8>* %S
-  ret void
-}
-
-define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512-LABEL: trunc_v16i32_to_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %vec = load <64 x i8>, <64 x i8>* %L
-  %bc = bitcast <64 x i8> %vec to <16 x i32>
-  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
-  store <16 x i8> %strided.vec, <16 x i8>* %S
-  ret void
-}
-
-define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
-; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT:    vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT:    vmovdqa %xmm1, (%rsi)
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VBMI-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512VBMI-NEXT:    vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512VBMIVL-NEXT:    vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <32 x i16>, <32 x i16>* %L
-  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
-  store <8 x i16> %strided.vec, <8 x i16>* %S
-  ret void
-}
-
-define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX512-LABEL: trunc_v8i64_to_v8i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %vec = load <32 x i16>, <32 x i16>* %L
-  %bc = bitcast <32 x i16> %vec to <8 x i64>
-  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
-  store <8 x i16> %strided.vec, <8 x i16>* %S
-  ret void
-}
-
-define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BWVL-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VBMI-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VBMI-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VBMI-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512VBMI-NEXT:    vmovq %xmm0, (%rsi)
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
-; AVX512VBMIVL-NEXT:    vpermi2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT:    vmovq %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %vec = load <64 x i8>, <64 x i8>* %L
-  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
-  store <8 x i8> %strided.vec, <8 x i8>* %S
-  ret void
-}
-
-define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512-LABEL: trunc_v8i64_to_v8i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %vec = load <64 x i8>, <64 x i8>* %L
-  %bc = bitcast <64 x i8> %vec to <8 x i64>
-  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
-  store <8 x i8> %strided.vec, <8 x i8>* %S
-  ret void
-}
-
-define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
-; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
-; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMI-NEXT:    vzeroupper
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
-; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VBMIVL-NEXT:    vpermt2b %ymm2, %ymm1, %ymm0
-; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
-; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BWVL-NEXT:    vzeroupper
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
-; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMI-NEXT:    vzeroupper
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
-; AVX512VBMIVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VBMIVL-NEXT:    vpermt2b %ymm2, %ymm1, %ymm0
-; AVX512VBMIVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMIVL-NEXT:    vzeroupper
-; AVX512VBMIVL-NEXT:    retq
-  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
-  ret <16 x i8> %res
-}
-
-define <4 x double> @PR34175(<32 x i16>* %p) {
-; AVX512F-LABEL: PR34175:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512F-NEXT:    vpbroadcastd %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: PR34175:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqu 32(%rdi), %xmm1
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,4,2,3]
-; AVX512VL-NEXT:    vpermi2d %xmm1, %xmm0, %xmm2
-; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: PR34175:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512BW-NEXT:    vmovdqu 32(%rdi), %xmm1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512BW-NEXT:    vpbroadcastd %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BW-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: PR34175:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
-; AVX512BWVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512BWVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: PR34175:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512VBMI-NEXT:    vmovdqu 32(%rdi), %xmm1
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VBMI-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VBMI-NEXT:    vpbroadcastd %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512VBMI-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VBMI-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX512VBMI-NEXT:    retq
-;
-; AVX512VBMIVL-LABEL: PR34175:
-; AVX512VBMIVL:       # %bb.0:
-; AVX512VBMIVL-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX512VBMIVL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
-; AVX512VBMIVL-NEXT:    vpermi2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512VBMIVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX512VBMIVL-NEXT:    retq
-  %v = load <32 x i16>, <32 x i16>* %p, align 2
-  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
-  %tofp = uitofp <4 x i16> %shuf to <4 x double>
-  ret <4 x double> %tofp
-}
-
-define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
-; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %truncated = trunc <8 x i64> %vec to <8 x i8>
-  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i8> %result
-}
-

Modified: llvm/trunk/test/CodeGen/X86/vec_cast2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_cast2.ll?rev=368079&r1=368078&r2=368079&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_cast2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_cast2.ll Tue Aug  6 13:12:20 2019
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
 
 define <8 x float> @cvt_v8i8_v8f32(<8 x i8> %src) {
 ; CHECK-LABEL: cvt_v8i8_v8f32:
@@ -11,15 +10,6 @@ define <8 x float> @cvt_v8i8_v8f32(<8 x
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v8i8_v8f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovsxbd %xmm0, %xmm1
-; CHECK-WIDE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; CHECK-WIDE-NEXT:    vpmovsxbd %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-WIDE-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    retl
   %res = sitofp <8 x i8> %src to <8 x float>
   ret <8 x float> %res
 }
@@ -33,15 +23,6 @@ define <8 x float> @cvt_v8i16_v8f32(<8 x
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v8i16_v8f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovsxwd %xmm0, %xmm1
-; CHECK-WIDE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; CHECK-WIDE-NEXT:    vpmovsxwd %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-WIDE-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    retl
   %res = sitofp <8 x i16> %src to <8 x float>
   ret <8 x float> %res
 }
@@ -52,12 +33,6 @@ define <4 x float> @cvt_v4i8_v4f32(<4 x
 ; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm0
 ; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v4i8_v4f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovsxbd %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = sitofp <4 x i8> %src to <4 x float>
   ret <4 x float> %res
 }
@@ -68,12 +43,6 @@ define <4 x float> @cvt_v4i16_v4f32(<4 x
 ; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm0
 ; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v4i16_v4f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovsxwd %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = sitofp <4 x i16> %src to <4 x float>
   ret <4 x float> %res
 }
@@ -87,15 +56,6 @@ define <8 x float> @cvt_v8u8_v8f32(<8 x
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v8u8_v8f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-WIDE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; CHECK-WIDE-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-WIDE-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    retl
   %res = uitofp <8 x i8> %src to <8 x float>
   ret <8 x float> %res
 }
@@ -109,15 +69,6 @@ define <8 x float> @cvt_v8u16_v8f32(<8 x
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v8u16_v8f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-WIDE-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    retl
   %res = uitofp <8 x i16> %src to <8 x float>
   ret <8 x float> %res
 }
@@ -128,12 +79,6 @@ define <4 x float> @cvt_v4u8_v4f32(<4 x
 ; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v4u8_v4f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = uitofp <4 x i8> %src to <4 x float>
   ret <4 x float> %res
 }
@@ -144,12 +89,6 @@ define <4 x float> @cvt_v4u16_v4f32(<4 x
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v4u16_v4f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = uitofp <4 x i16> %src to <4 x float>
   ret <4 x float> %res
 }
@@ -163,15 +102,6 @@ define <8 x i8> @cvt_v8f32_v8i8(<8 x flo
 ; CHECK-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v8f32_v8i8:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vzeroupper
-; CHECK-WIDE-NEXT:    retl
   %res = fptosi <8 x float> %src to <8 x i8>
   ret <8 x i8> %res
 }
@@ -184,14 +114,6 @@ define <8 x i16> @cvt_v8f32_v8i16(<8 x f
 ; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v8f32_v8i16:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vzeroupper
-; CHECK-WIDE-NEXT:    retl
   %res = fptosi <8 x float> %src to <8 x i16>
   ret <8 x i16> %res
 }
@@ -202,12 +124,6 @@ define <4 x i8> @cvt_v4f32_v4i8(<4 x flo
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v4f32_v4i8:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; CHECK-WIDE-NEXT:    retl
   %res = fptosi <4 x float> %src to <4 x i8>
   ret <4 x i8> %res
 }
@@ -218,12 +134,6 @@ define <4 x i16> @cvt_v4f32_v4i16(<4 x f
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v4f32_v4i16:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = fptosi <4 x float> %src to <4 x i16>
   ret <4 x i16> %res
 }
@@ -237,15 +147,6 @@ define <8 x i8> @cvt_v8f32_v8u8(<8 x flo
 ; CHECK-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v8f32_v8u8:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vzeroupper
-; CHECK-WIDE-NEXT:    retl
   %res = fptoui <8 x float> %src to <8 x i8>
   ret <8 x i8> %res
 }
@@ -258,14 +159,6 @@ define <8 x i16> @cvt_v8f32_v8u16(<8 x f
 ; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v8f32_v8u16:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vzeroupper
-; CHECK-WIDE-NEXT:    retl
   %res = fptoui <8 x float> %src to <8 x i16>
   ret <8 x i16> %res
 }
@@ -276,12 +169,6 @@ define <4 x i8> @cvt_v4f32_v4u8(<4 x flo
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v4f32_v4u8:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; CHECK-WIDE-NEXT:    retl
   %res = fptoui <4 x float> %src to <4 x i8>
   ret <4 x i8> %res
 }
@@ -292,12 +179,6 @@ define <4 x i16> @cvt_v4f32_v4u16(<4 x f
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v4f32_v4u16:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = fptoui <4 x float> %src to <4 x i16>
   ret <4 x i16> %res
 }

Modified: llvm/trunk/test/CodeGen/X86/vec_cast3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_cast3.ll?rev=368079&r1=368078&r2=368079&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_cast3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_cast3.ll Tue Aug  6 13:12:20 2019
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
 
 define <2 x float> @cvt_v2i8_v2f32(<2 x i8> %src) {
 ; CHECK-LABEL: cvt_v2i8_v2f32:
@@ -8,12 +7,6 @@ define <2 x float> @cvt_v2i8_v2f32(<2 x
 ; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm0
 ; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2i8_v2f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovsxbd %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = sitofp <2 x i8> %src to <2 x float>
   ret <2 x float> %res
 }
@@ -24,12 +17,6 @@ define <2 x float> @cvt_v2i16_v2f32(<2 x
 ; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm0
 ; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2i16_v2f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovsxwd %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = sitofp <2 x i16> %src to <2 x float>
   ret <2 x float> %res
 }
@@ -39,11 +26,6 @@ define <2 x float> @cvt_v2i32_v2f32(<2 x
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2i32_v2f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = sitofp <2 x i32> %src to <2 x float>
   ret <2 x float> %res
 }
@@ -54,12 +36,6 @@ define <2 x float> @cvt_v2u8_v2f32(<2 x
 ; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2u8_v2f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = uitofp <2 x i8> %src to <2 x float>
   ret <2 x float> %res
 }
@@ -70,12 +46,6 @@ define <2 x float> @cvt_v2u16_v2f32(<2 x
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2u16_v2f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-WIDE-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = uitofp <2 x i16> %src to <2 x float>
   ret <2 x float> %res
 }
@@ -89,15 +59,6 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x
 ; CHECK-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vcvtpd2ps %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2u32_v2f32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; CHECK-WIDE-NEXT:    vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; CHECK-WIDE-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vsubpd %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vcvtpd2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = uitofp <2 x i32> %src to <2 x float>
   ret <2 x float> %res
 }
@@ -108,12 +69,6 @@ define <2 x i8> @cvt_v2f32_v2i8(<2 x flo
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2i8:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; CHECK-WIDE-NEXT:    retl
   %res = fptosi <2 x float> %src to <2 x i8>
   ret <2 x i8> %res
 }
@@ -124,12 +79,6 @@ define <2 x i16> @cvt_v2f32_v2i16(<2 x f
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2i16:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-WIDE-NEXT:    retl
   %res = fptosi <2 x float> %src to <2 x i16>
   ret <2 x i16> %res
 }
@@ -139,11 +88,6 @@ define <2 x i32> @cvt_v2f32_v2i32(<2 x f
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2i32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = fptosi <2 x float> %src to <2 x i32>
   ret <2 x i32> %res
 }
@@ -154,12 +98,6 @@ define <2 x i8> @cvt_v2f32_v2u8(<2 x flo
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2u8:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; CHECK-WIDE-NEXT:    retl
   %res = fptoui <2 x float> %src to <2 x i8>
   ret <2 x i8> %res
 }
@@ -170,12 +108,6 @@ define <2 x i16> @cvt_v2f32_v2u16(<2 x f
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2u16:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-WIDE-NEXT:    retl
   %res = fptoui <2 x float> %src to <2 x i16>
   ret <2 x i16> %res
 }
@@ -191,17 +123,6 @@ define <2 x i32> @cvt_v2f32_v2u32(<2 x f
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2u32:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; CHECK-WIDE-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
-; CHECK-WIDE-NEXT:    vsubps %xmm1, %xmm0, %xmm1
-; CHECK-WIDE-NEXT:    vcvttps2dq %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vxorps LCPI11_1, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-WIDE-NEXT:    retl
   %res = fptoui <2 x float> %src to <2 x i32>
   ret <2 x i32> %res
 }
@@ -214,14 +135,6 @@ define <32 x i8> @PR40146(<4 x i64> %x)
 ; CHECK-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retl
-;
-; CHECK-WIDE-LABEL: PR40146:
-; CHECK-WIDE:       ## %bb.0:
-; CHECK-WIDE-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK-WIDE-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-WIDE-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-WIDE-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-WIDE-NEXT:    retl
   %perm = shufflevector <4 x i64> %x, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
   %t1 = bitcast <4 x i64> %perm to <32 x i8>
   %t2 = shufflevector <32 x i8> %t1, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 0, i32 32, i32 1, i32 32, i32 2, i32 32, i32 3, i32 32, i32 4, i32 32, i32 5, i32 32, i32 6, i32 32, i32 7, i32 32, i32 16, i32 48, i32 17, i32 48, i32 18, i32 48, i32 19, i32 48, i32 20, i32 48, i32 21, i32 48, i32 22, i32 48, i32 23, i32 48>

Removed: llvm/trunk/test/CodeGen/X86/vec_clz.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_clz.ll?rev=368078&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_clz.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_clz.ll (removed)
@@ -1,85 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-experimental-vector-widening-legalization | FileCheck %s
-
-declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1 immarg)
-
-define <2 x i32> @illegal_ctlz(<2 x i32> %v1) {
-; CHECK-LABEL: illegal_ctlz:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrld $1, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrld $2, %xmm0
-; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrld $4, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    psrld $8, %xmm0
-; CHECK-NEXT:    por %xmm1, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrld $16, %xmm1
-; CHECK-NEXT:    por %xmm0, %xmm1
-; CHECK-NEXT:    pcmpeqd %xmm2, %xmm2
-; CHECK-NEXT:    pxor %xmm1, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlw $1, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    psubb %xmm0, %xmm2
-; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT:    movdqa %xmm2, %xmm1
-; CHECK-NEXT:    pand %xmm0, %xmm1
-; CHECK-NEXT:    psrlw $2, %xmm2
-; CHECK-NEXT:    pand %xmm0, %xmm2
-; CHECK-NEXT:    paddb %xmm1, %xmm2
-; CHECK-NEXT:    movdqa %xmm2, %xmm0
-; CHECK-NEXT:    psrlw $4, %xmm0
-; CHECK-NEXT:    paddb %xmm2, %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; CHECK-NEXT:    psadbw %xmm1, %xmm2
-; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    psadbw %xmm1, %xmm0
-; CHECK-NEXT:    packuswb %xmm2, %xmm0
-; CHECK-NEXT:    retq
-  %v2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %v1, i1 true)
-  ret <2 x i32> %v2
-}
-
-declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1 immarg)
-
-define <2 x i32> @illegal_cttz(<2 x i32> %v1) {
-; CHECK-LABEL: illegal_cttz:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT:    paddd %xmm0, %xmm1
-; CHECK-NEXT:    pandn %xmm1, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlw $1, %xmm1
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    psubb %xmm1, %xmm0
-; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
-; CHECK-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-NEXT:    pand %xmm1, %xmm2
-; CHECK-NEXT:    psrlw $2, %xmm0
-; CHECK-NEXT:    pand %xmm1, %xmm0
-; CHECK-NEXT:    paddb %xmm2, %xmm0
-; CHECK-NEXT:    movdqa %xmm0, %xmm1
-; CHECK-NEXT:    psrlw $4, %xmm1
-; CHECK-NEXT:    paddb %xmm0, %xmm1
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
-; CHECK-NEXT:    pxor %xmm0, %xmm0
-; CHECK-NEXT:    movdqa %xmm1, %xmm2
-; CHECK-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; CHECK-NEXT:    psadbw %xmm0, %xmm2
-; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT:    psadbw %xmm0, %xmm1
-; CHECK-NEXT:    packuswb %xmm2, %xmm1
-; CHECK-NEXT:    movdqa %xmm1, %xmm0
-; CHECK-NEXT:    retq
-  %v2 = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %v1, i1 true)
-  ret <2 x i32> %v2
-}

Removed: llvm/trunk/test/CodeGen/X86/vec_fp_to_int-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_fp_to_int-widen.ll?rev=368078&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_fp_to_int-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_fp_to_int-widen.ll (removed)
@@ -1,2794 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VLDQ
-;
-; 32-bit tests to make sure we're not doing anything stupid.
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2
-
-;
-; Double to Signed Integer
-;
-
-define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
-; SSE-LABEL: fptosi_2f64_to_2i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm1
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: fptosi_2f64_to_2i64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vcvttsd2si %xmm0, %rax
-; VEX-NEXT:    vmovq %rax, %xmm1
-; VEX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; VEX-NEXT:    vcvttsd2si %xmm0, %rax
-; VEX-NEXT:    vmovq %rax, %xmm0
-; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: fptosi_2f64_to_2i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptosi_2f64_to_2i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptosi_2f64_to_2i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvttpd2qq %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttpd2qq %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptosi <2 x double> %a to <2 x i64>
-  ret <2 x i64> %cvt
-}
-
-define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) {
-; SSE-LABEL: fptosi_2f64_to_4i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_2f64_to_4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %cvt = fptosi <2 x double> %a to <2 x i32>
-  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i32> %ext
-}
-
-define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) {
-; SSE-LABEL: fptosi_2f64_to_2i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_2f64_to_2i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %cvt = fptosi <2 x double> %a to <2 x i32>
-  ret <2 x i32> %cvt
-}
-
-define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
-; SSE-LABEL: fptosi_4f64_to_2i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_4f64_to_2i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-  %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %cvt = fptosi <4 x double> %ext to <4 x i32>
-  ret <4 x i32> %cvt
-}
-
-define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
-; SSE-LABEL: fptosi_4f64_to_4i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm2
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT:    cvttsd2si %xmm1, %rax
-; SSE-NEXT:    movq %rax, %xmm3
-; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT:    cvttsd2si %xmm1, %rax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    movdqa %xmm3, %xmm1
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptosi_4f64_to_4i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm2
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX1-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm2
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptosi_4f64_to_4i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm2
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm2
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptosi_4f64_to_4i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptosi_4f64_to_4i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT:    vcvttsd2si %xmm1, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT:    vcvttsd2si %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptosi_4f64_to_4i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT:    vcvttpd2qq %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptosi_4f64_to_4i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttpd2qq %ymm0, %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptosi <4 x double> %a to <4 x i64>
-  ret <4 x i64> %cvt
-}
-
-define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) {
-; SSE-LABEL: fptosi_4f64_to_4i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttpd2dq %xmm1, %xmm1
-; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_4f64_to_4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-  %cvt = fptosi <4 x double> %a to <4 x i32>
-  ret <4 x i32> %cvt
-}
-
-;
-; Double to Unsigned Integer
-;
-
-define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_2i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT:    movapd %xmm0, %xmm1
-; SSE-NEXT:    subsd %xmm2, %xmm1
-; SSE-NEXT:    cvttsd2si %xmm1, %rax
-; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttsd2si %xmm0, %rdx
-; SSE-NEXT:    ucomisd %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rdx
-; SSE-NEXT:    movq %rdx, %xmm1
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    movapd %xmm0, %xmm3
-; SSE-NEXT:    subsd %xmm2, %xmm3
-; SSE-NEXT:    cvttsd2si %xmm3, %rax
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttsd2si %xmm0, %rcx
-; SSE-NEXT:    ucomisd %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rcx
-; SSE-NEXT:    movq %rcx, %xmm0
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: fptoui_2f64_to_2i64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; VEX-NEXT:    vsubsd %xmm1, %xmm0, %xmm2
-; VEX-NEXT:    vcvttsd2si %xmm2, %rax
-; VEX-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttsd2si %xmm0, %rdx
-; VEX-NEXT:    vucomisd %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rdx
-; VEX-NEXT:    vmovq %rdx, %xmm2
-; VEX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; VEX-NEXT:    vsubsd %xmm1, %xmm0, %xmm3
-; VEX-NEXT:    vcvttsd2si %xmm3, %rax
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttsd2si %xmm0, %rcx
-; VEX-NEXT:    vucomisd %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rcx
-; VEX-NEXT:    vmovq %rcx, %xmm0
-; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_2f64_to_2i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvttsd2usi %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT:    vcvttsd2usi %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_2f64_to_2i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttsd2usi %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT:    vcvttsd2usi %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_2f64_to_2i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvttpd2uqq %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttpd2uqq %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptoui <2 x double> %a to <2 x i64>
-  ret <2 x i64> %cvt
-}
-
-define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_4i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rcx
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movd %ecx, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptoui_2f64_to_4i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT:    vpackssdw %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm3
-; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_2f64_to_4i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpackssdw %xmm0, %xmm2, %xmm2
-; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_2f64_to_4i32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_2f64_to_4i32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttpd2udq %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_2f64_to_4i32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttpd2udq %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptoui <2 x double> %a to <2 x i32>
-  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i32> %ext
-}
-
-define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_2i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptoui_2f64_to_2i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm3
-; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_2f64_to_2i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_2f64_to_2i32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_2f64_to_2i32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttpd2udq %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_2f64_to_2i32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttpd2udq %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptoui <2 x double> %a to <2 x i32>
-  %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  ret <4 x i32> %ext
-}
-
-define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
-; SSE-LABEL: fptoui_4f64_to_2i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptoui_4f64_to_2i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovapd %xmm0, %xmm0
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
-; AVX1-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_4f64_to_2i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovapd %xmm0, %xmm0
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_4f64_to_2i32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps %xmm0, %xmm0
-; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_4f64_to_2i32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps %xmm0, %xmm0
-; AVX512VL-NEXT:    vcvttpd2udq %ymm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_4f64_to_2i32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps %xmm0, %xmm0
-; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vmovaps %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    vcvttpd2udq %ymm0, %xmm0
-; AVX512VLDQ-NEXT:    vzeroupper
-; AVX512VLDQ-NEXT:    retq
-  %ext = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cvt = fptoui <4 x double> %ext to <4 x i32>
-  ret <4 x i32> %cvt
-}
-
-define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
-; SSE-LABEL: fptoui_4f64_to_4i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movapd %xmm0, %xmm2
-; SSE-NEXT:    movsd {{.*#+}} xmm3 = mem[0],zero
-; SSE-NEXT:    subsd %xmm3, %xmm0
-; SSE-NEXT:    cvttsd2si %xmm0, %rcx
-; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttsd2si %xmm2, %rdx
-; SSE-NEXT:    ucomisd %xmm3, %xmm2
-; SSE-NEXT:    cmovaeq %rcx, %rdx
-; SSE-NEXT:    movq %rdx, %xmm0
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT:    movapd %xmm2, %xmm4
-; SSE-NEXT:    subsd %xmm3, %xmm4
-; SSE-NEXT:    cvttsd2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttsd2si %xmm2, %rdx
-; SSE-NEXT:    ucomisd %xmm3, %xmm2
-; SSE-NEXT:    cmovaeq %rcx, %rdx
-; SSE-NEXT:    movq %rdx, %xmm2
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT:    movapd %xmm1, %xmm2
-; SSE-NEXT:    subsd %xmm3, %xmm2
-; SSE-NEXT:    cvttsd2si %xmm2, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttsd2si %xmm1, %rdx
-; SSE-NEXT:    ucomisd %xmm3, %xmm1
-; SSE-NEXT:    cmovaeq %rcx, %rdx
-; SSE-NEXT:    movq %rdx, %xmm2
-; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT:    movapd %xmm1, %xmm4
-; SSE-NEXT:    subsd %xmm3, %xmm4
-; SSE-NEXT:    cvttsd2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttsd2si %xmm1, %rax
-; SSE-NEXT:    ucomisd %xmm3, %xmm1
-; SSE-NEXT:    cmovaeq %rcx, %rax
-; SSE-NEXT:    movq %rax, %xmm1
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptoui_4f64_to_4i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT:    vsubsd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vcvttsd2si %xmm3, %rax
-; AVX1-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttsd2si %xmm2, %rdx
-; AVX1-NEXT:    vucomisd %xmm1, %xmm2
-; AVX1-NEXT:    cmovaeq %rax, %rdx
-; AVX1-NEXT:    vmovq %rdx, %xmm3
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX1-NEXT:    vsubsd %xmm1, %xmm2, %xmm4
-; AVX1-NEXT:    vcvttsd2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttsd2si %xmm2, %rdx
-; AVX1-NEXT:    vucomisd %xmm1, %xmm2
-; AVX1-NEXT:    cmovaeq %rax, %rdx
-; AVX1-NEXT:    vmovq %rdx, %xmm2
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vcvttsd2si %xmm3, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttsd2si %xmm0, %rdx
-; AVX1-NEXT:    vucomisd %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rdx
-; AVX1-NEXT:    vmovq %rdx, %xmm3
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm4
-; AVX1-NEXT:    vcvttsd2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttsd2si %xmm0, %rcx
-; AVX1-NEXT:    vucomisd %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_4f64_to_4i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vsubsd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vcvttsd2si %xmm3, %rax
-; AVX2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttsd2si %xmm2, %rdx
-; AVX2-NEXT:    vucomisd %xmm1, %xmm2
-; AVX2-NEXT:    cmovaeq %rax, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm3
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX2-NEXT:    vsubsd %xmm1, %xmm2, %xmm4
-; AVX2-NEXT:    vcvttsd2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttsd2si %xmm2, %rdx
-; AVX2-NEXT:    vucomisd %xmm1, %xmm2
-; AVX2-NEXT:    cmovaeq %rax, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm2
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX2-NEXT:    vsubsd %xmm1, %xmm0, %xmm3
-; AVX2-NEXT:    vcvttsd2si %xmm3, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttsd2si %xmm0, %rdx
-; AVX2-NEXT:    vucomisd %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm3
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT:    vsubsd %xmm1, %xmm0, %xmm4
-; AVX2-NEXT:    vcvttsd2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttsd2si %xmm0, %rcx
-; AVX2-NEXT:    vucomisd %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rcx
-; AVX2-NEXT:    vmovq %rcx, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_4f64_to_4i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vcvttsd2usi %xmm1, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT:    vcvttsd2usi %xmm1, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT:    vcvttsd2usi %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT:    vcvttsd2usi %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_4f64_to_4i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vcvttsd2usi %xmm1, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT:    vcvttsd2usi %xmm1, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT:    vcvttsd2usi %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT:    vcvttsd2usi %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_4f64_to_4i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT:    vcvttpd2uqq %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f64_to_4i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttpd2uqq %ymm0, %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptoui <4 x double> %a to <4 x i64>
-  ret <4 x i64> %cvt
-}
-
-define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
-; SSE-LABEL: fptoui_4f64_to_4i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttsd2si %xmm1, %rax
-; SSE-NEXT:    movd %eax, %xmm2
-; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT:    cvttsd2si %xmm1, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttsd2si %xmm0, %rax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptoui_4f64_to_4i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
-; AVX1-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX1-NEXT:    vxorpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_4f64_to_4i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT:    vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_4f64_to_4i32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_4f64_to_4i32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttpd2udq %ymm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_4f64_to_4i32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttpd2udq %ymm0, %xmm0
-; AVX512VLDQ-NEXT:    vzeroupper
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptoui <4 x double> %a to <4 x i32>
-  ret <4 x i32> %cvt
-}
-
-;
-; Float to Signed Integer
-;
-
-define <2 x i32> @fptosi_2f32_to_2i32(<2 x float> %a) {
-; SSE-LABEL: fptosi_2f32_to_2i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_2f32_to_2i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %cvt = fptosi <2 x float> %a to <2 x i32>
-  ret <2 x i32> %cvt
-}
-
-define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) {
-; SSE-LABEL: fptosi_4f32_to_4i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_4f32_to_4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %cvt = fptosi <4 x float> %a to <4 x i32>
-  ret <4 x i32> %cvt
-}
-
-define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
-; SSE-LABEL: fptosi_2f32_to_2i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: fptosi_2f32_to_2i64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vcvttss2si %xmm0, %rax
-; VEX-NEXT:    vmovq %rax, %xmm1
-; VEX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; VEX-NEXT:    vcvttss2si %xmm0, %rax
-; VEX-NEXT:    vmovq %rax, %xmm0
-; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: fptosi_2f32_to_2i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptosi_2f32_to_2i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptosi_2f32_to_2i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
-  %cvt = fptosi <2 x float> %shuf to <2 x i64>
-  ret <2 x i64> %cvt
-}
-
-define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
-; SSE-LABEL: fptosi_4f32_to_2i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: fptosi_4f32_to_2i64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; VEX-NEXT:    vcvttss2si %xmm1, %rax
-; VEX-NEXT:    vcvttss2si %xmm0, %rcx
-; VEX-NEXT:    vmovq %rcx, %xmm0
-; VEX-NEXT:    vmovq %rax, %xmm1
-; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: fptosi_4f32_to_2i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT:    vcvttss2si %xmm1, %rax
-; AVX512F-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX512F-NEXT:    vmovq %rcx, %xmm0
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptosi_4f32_to_2i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT:    vcvttss2si %xmm1, %rax
-; AVX512VL-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX512VL-NEXT:    vmovq %rcx, %xmm0
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptosi_4f32_to_2i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %ymm0
-; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512VLDQ-NEXT:    vzeroupper
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptosi <4 x float> %a to <4 x i64>
-  %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x i64> %shuf
-}
-
-define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) {
-; SSE-LABEL: fptosi_8f32_to_8i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_8f32_to_8i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttps2dq %ymm0, %ymm0
-; AVX-NEXT:    retq
-  %cvt = fptosi <8 x float> %a to <8 x i32>
-  ret <8 x i32> %cvt
-}
-
-define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
-; SSE-LABEL: fptosi_4f32_to_4i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm2
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movq %rax, %xmm1
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movq %rax, %xmm3
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm1
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptosi_4f32_to_4i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT:    vcvttss2si %xmm1, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm1
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-NEXT:    vcvttss2si %xmm2, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm2
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT:    vcvttss2si %xmm0, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm2
-; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vcvttss2si %xmm0, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptosi_4f32_to_4i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT:    vcvttss2si %xmm1, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm1
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX2-NEXT:    vcvttss2si %xmm2, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm2
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vcvttss2si %xmm0, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm2
-; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vcvttss2si %xmm0, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptosi_4f32_to_4i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT:    vcvttss2si %xmm1, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512F-NEXT:    vcvttss2si %xmm2, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptosi_4f32_to_4i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT:    vcvttss2si %xmm1, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512VL-NEXT:    vcvttss2si %xmm2, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptosi_4f32_to_4i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptosi_4f32_to_4i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cvt = fptosi <4 x float> %shuf to <4 x i64>
-  ret <4 x i64> %cvt
-}
-
-define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
-; SSE-LABEL: fptosi_8f32_to_4i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm2
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movq %rax, %xmm1
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movq %rax, %xmm3
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movq %rax, %xmm1
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptosi_8f32_to_4i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT:    vcvttss2si %xmm1, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm1
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-NEXT:    vcvttss2si %xmm2, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm2
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT:    vcvttss2si %xmm0, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm2
-; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vcvttss2si %xmm0, %rax
-; AVX1-NEXT:    vmovq %rax, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptosi_8f32_to_4i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT:    vcvttss2si %xmm1, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm1
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX2-NEXT:    vcvttss2si %xmm2, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm2
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vcvttss2si %xmm0, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm2
-; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vcvttss2si %xmm0, %rax
-; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptosi_8f32_to_4i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT:    vcvttss2si %xmm1, %rax
-; AVX512F-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT:    vcvttss2si %xmm1, %rdx
-; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT:    vcvttss2si %xmm0, %rsi
-; AVX512F-NEXT:    vmovq %rsi, %xmm0
-; AVX512F-NEXT:    vmovq %rdx, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vmovq %rcx, %xmm1
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptosi_8f32_to_4i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT:    vcvttss2si %xmm1, %rax
-; AVX512VL-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT:    vcvttss2si %xmm1, %rdx
-; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT:    vcvttss2si %xmm0, %rsi
-; AVX512VL-NEXT:    vmovq %rsi, %xmm0
-; AVX512VL-NEXT:    vmovq %rdx, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    vmovq %rcx, %xmm1
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptosi_8f32_to_4i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptosi_8f32_to_4i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2qq %ymm0, %zmm0
-; AVX512VLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptosi <8 x float> %a to <8 x i64>
-  %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i64> %shuf
-}
-
-;
-; Float to Unsigned Integer
-;
-
-define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
-; SSE-LABEL: fptoui_2f32_to_2i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    cmpltps %xmm2, %xmm1
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
-; SSE-NEXT:    subps %xmm2, %xmm0
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    xorps {{.*}}(%rip), %xmm0
-; SSE-NEXT:    andps %xmm1, %xmm3
-; SSE-NEXT:    andnps %xmm0, %xmm1
-; SSE-NEXT:    orps %xmm3, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptoui_2f32_to_2i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_2f32_to_2i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX2-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vsubps %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorps %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_2f32_to_2i32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_2f32_to_2i32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttps2udq %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_2f32_to_2i32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2udq %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptoui <2 x float> %a to <2 x i32>
-  ret <2 x i32> %cvt
-}
-
-define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
-; SSE-LABEL: fptoui_4f32_to_4i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    cmpltps %xmm2, %xmm1
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
-; SSE-NEXT:    subps %xmm2, %xmm0
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    xorps {{.*}}(%rip), %xmm0
-; SSE-NEXT:    andps %xmm1, %xmm3
-; SSE-NEXT:    andnps %xmm0, %xmm1
-; SSE-NEXT:    orps %xmm3, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptoui_4f32_to_4i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm1
-; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT:    vxorps {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_4f32_to_4i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX2-NEXT:    vcmpltps %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vsubps %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vcvttps2dq %xmm1, %xmm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorps %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_4f32_to_4i32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_4f32_to_4i32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttps2udq %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_4f32_to_4i32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f32_to_4i32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2udq %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptoui <4 x float> %a to <4 x i32>
-  ret <4 x i32> %cvt
-}
-
-define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
-; SSE-LABEL: fptoui_2f32_to_2i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    subss %xmm2, %xmm1
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttss2si %xmm0, %rdx
-; SSE-NEXT:    ucomiss %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rdx
-; SSE-NEXT:    movq %rdx, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    subss %xmm2, %xmm3
-; SSE-NEXT:    cvttss2si %xmm3, %rax
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttss2si %xmm0, %rcx
-; SSE-NEXT:    ucomiss %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rcx
-; SSE-NEXT:    movq %rcx, %xmm0
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: fptoui_2f32_to_2i64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm2
-; VEX-NEXT:    vcvttss2si %xmm2, %rax
-; VEX-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttss2si %xmm0, %rdx
-; VEX-NEXT:    vucomiss %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rdx
-; VEX-NEXT:    vmovq %rdx, %xmm2
-; VEX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm3
-; VEX-NEXT:    vcvttss2si %xmm3, %rax
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttss2si %xmm0, %rcx
-; VEX-NEXT:    vucomiss %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rcx
-; VEX-NEXT:    vmovq %rcx, %xmm0
-; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_2f32_to_2i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_2f32_to_2i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_2f32_to_2i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
-  %cvt = fptoui <2 x float> %shuf to <2 x i64>
-  ret <2 x i64> %cvt
-}
-
-define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
-; SSE-LABEL: fptoui_4f32_to_2i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT:    movaps %xmm0, %xmm1
-; SSE-NEXT:    subss %xmm2, %xmm1
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttss2si %xmm0, %rdx
-; SSE-NEXT:    ucomiss %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rdx
-; SSE-NEXT:    movq %rdx, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    subss %xmm2, %xmm3
-; SSE-NEXT:    cvttss2si %xmm3, %rax
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttss2si %xmm0, %rcx
-; SSE-NEXT:    ucomiss %xmm2, %xmm0
-; SSE-NEXT:    cmovaeq %rax, %rcx
-; SSE-NEXT:    movq %rcx, %xmm0
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: fptoui_4f32_to_2i64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; VEX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; VEX-NEXT:    vsubss %xmm2, %xmm1, %xmm3
-; VEX-NEXT:    vcvttss2si %xmm3, %rax
-; VEX-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttss2si %xmm1, %rdx
-; VEX-NEXT:    vucomiss %xmm2, %xmm1
-; VEX-NEXT:    cmovaeq %rax, %rdx
-; VEX-NEXT:    vsubss %xmm2, %xmm0, %xmm1
-; VEX-NEXT:    vcvttss2si %xmm1, %rax
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttss2si %xmm0, %rcx
-; VEX-NEXT:    vucomiss %xmm2, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rcx
-; VEX-NEXT:    vmovq %rcx, %xmm0
-; VEX-NEXT:    vmovq %rdx, %xmm1
-; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_4f32_to_2i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT:    vcvttss2usi %xmm1, %rax
-; AVX512F-NEXT:    vcvttss2usi %xmm0, %rcx
-; AVX512F-NEXT:    vmovq %rcx, %xmm0
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_4f32_to_2i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT:    vcvttss2usi %xmm1, %rax
-; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rcx
-; AVX512VL-NEXT:    vmovq %rcx, %xmm0
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_4f32_to_2i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %ymm0
-; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512VLDQ-NEXT:    vzeroupper
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptoui <4 x float> %a to <4 x i64>
-  %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x i64> %shuf
-}
-
-define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
-; SSE-LABEL: fptoui_8f32_to_8i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    cmpltps %xmm4, %xmm2
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm3
-; SSE-NEXT:    subps %xmm4, %xmm0
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE-NEXT:    xorps %xmm5, %xmm0
-; SSE-NEXT:    andps %xmm2, %xmm3
-; SSE-NEXT:    andnps %xmm0, %xmm2
-; SSE-NEXT:    orps %xmm3, %xmm2
-; SSE-NEXT:    movaps %xmm1, %xmm3
-; SSE-NEXT:    cmpltps %xmm4, %xmm3
-; SSE-NEXT:    cvttps2dq %xmm1, %xmm0
-; SSE-NEXT:    subps %xmm4, %xmm1
-; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE-NEXT:    xorps %xmm5, %xmm1
-; SSE-NEXT:    andps %xmm3, %xmm0
-; SSE-NEXT:    andnps %xmm1, %xmm3
-; SSE-NEXT:    orps %xmm0, %xmm3
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    movaps %xmm3, %xmm1
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptoui_8f32_to_8i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX1-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
-; AVX1-NEXT:    vsubps %ymm1, %ymm0, %ymm1
-; AVX1-NEXT:    vcvttps2dq %ymm1, %ymm1
-; AVX1-NEXT:    vxorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_8f32_to_8i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vsubps %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vcvttps2dq %ymm1, %ymm1
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vxorps %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_8f32_to_8i32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_8f32_to_8i32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttps2udq %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_8f32_to_8i32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT:    vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_8f32_to_8i32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2udq %ymm0, %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptoui <8 x float> %a to <8 x i32>
-  ret <8 x i32> %cvt
-}
-
-define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
-; SSE-LABEL: fptoui_4f32_to_4i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    subss %xmm1, %xmm2
-; SSE-NEXT:    cvttss2si %xmm2, %rcx
-; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm0, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm0
-; SSE-NEXT:    cmovaeq %rcx, %rdx
-; SSE-NEXT:    movq %rdx, %xmm2
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; SSE-NEXT:    movaps %xmm3, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm3, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm3
-; SSE-NEXT:    cmovaeq %rcx, %rdx
-; SSE-NEXT:    movq %rdx, %xmm3
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
-; SSE-NEXT:    movaps %xmm3, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm3, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm3
-; SSE-NEXT:    cmovaeq %rcx, %rdx
-; SSE-NEXT:    movq %rdx, %xmm3
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    movaps %xmm0, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    ucomiss %xmm1, %xmm0
-; SSE-NEXT:    cmovaeq %rcx, %rax
-; SSE-NEXT:    movq %rax, %xmm1
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptoui_4f32_to_4i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vsubss %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm2, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm2
-; AVX1-NEXT:    cmovaeq %rax, %rdx
-; AVX1-NEXT:    vmovq %rdx, %xmm2
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX1-NEXT:    vsubss %xmm1, %xmm3, %xmm4
-; AVX1-NEXT:    vcvttss2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm3, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm3
-; AVX1-NEXT:    cmovaeq %rax, %rdx
-; AVX1-NEXT:    vmovq %rdx, %xmm3
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm0, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rdx
-; AVX1-NEXT:    vmovq %rdx, %xmm3
-; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm4
-; AVX1-NEXT:    vcvttss2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_4f32_to_4i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm2, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm2
-; AVX2-NEXT:    cmovaeq %rax, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm2
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX2-NEXT:    vsubss %xmm1, %xmm3, %xmm4
-; AVX2-NEXT:    vcvttss2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm3, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm3
-; AVX2-NEXT:    cmovaeq %rax, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm3
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm3
-; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm0, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm3
-; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm4
-; AVX2-NEXT:    vcvttss2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rcx
-; AVX2-NEXT:    vmovq %rcx, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_4f32_to_4i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT:    vcvttss2usi %xmm1, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512F-NEXT:    vcvttss2usi %xmm2, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_4f32_to_4i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT:    vcvttss2usi %xmm1, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512VL-NEXT:    vcvttss2usi %xmm2, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_4f32_to_4i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f32_to_4i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cvt = fptoui <4 x float> %shuf to <4 x i64>
-  ret <4 x i64> %cvt
-}
-
-define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
-; SSE-LABEL: fptoui_8f32_to_4i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    movaps %xmm0, %xmm2
-; SSE-NEXT:    subss %xmm1, %xmm2
-; SSE-NEXT:    cvttss2si %xmm2, %rcx
-; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm0, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm0
-; SSE-NEXT:    cmovaeq %rcx, %rdx
-; SSE-NEXT:    movq %rdx, %xmm2
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; SSE-NEXT:    movaps %xmm3, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm3, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm3
-; SSE-NEXT:    cmovaeq %rcx, %rdx
-; SSE-NEXT:    movq %rdx, %xmm3
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT:    movaps %xmm0, %xmm3
-; SSE-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
-; SSE-NEXT:    movaps %xmm3, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm3, %rdx
-; SSE-NEXT:    ucomiss %xmm1, %xmm3
-; SSE-NEXT:    cmovaeq %rcx, %rdx
-; SSE-NEXT:    movq %rdx, %xmm3
-; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT:    movaps %xmm0, %xmm4
-; SSE-NEXT:    subss %xmm1, %xmm4
-; SSE-NEXT:    cvttss2si %xmm4, %rcx
-; SSE-NEXT:    xorq %rax, %rcx
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    ucomiss %xmm1, %xmm0
-; SSE-NEXT:    cmovaeq %rcx, %rax
-; SSE-NEXT:    movq %rax, %xmm1
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptoui_8f32_to_4i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vsubss %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm2, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm2
-; AVX1-NEXT:    cmovaeq %rax, %rdx
-; AVX1-NEXT:    vmovq %rdx, %xmm2
-; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX1-NEXT:    vsubss %xmm1, %xmm3, %xmm4
-; AVX1-NEXT:    vcvttss2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm3, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm3
-; AVX1-NEXT:    cmovaeq %rax, %rdx
-; AVX1-NEXT:    vmovq %rdx, %xmm3
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vcvttss2si %xmm3, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm0, %rdx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rdx
-; AVX1-NEXT:    vmovq %rdx, %xmm3
-; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm4
-; AVX1-NEXT:    vcvttss2si %xmm4, %rax
-; AVX1-NEXT:    xorq %rcx, %rax
-; AVX1-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX1-NEXT:    vucomiss %xmm1, %xmm0
-; AVX1-NEXT:    cmovaeq %rax, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm0
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_8f32_to_4i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vsubss %xmm1, %xmm2, %xmm3
-; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm2, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm2
-; AVX2-NEXT:    cmovaeq %rax, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm2
-; AVX2-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX2-NEXT:    vsubss %xmm1, %xmm3, %xmm4
-; AVX2-NEXT:    vcvttss2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm3, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm3
-; AVX2-NEXT:    cmovaeq %rax, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm3
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm3
-; AVX2-NEXT:    vcvttss2si %xmm3, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm0, %rdx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm3
-; AVX2-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vsubss %xmm1, %xmm0, %xmm4
-; AVX2-NEXT:    vcvttss2si %xmm4, %rax
-; AVX2-NEXT:    xorq %rcx, %rax
-; AVX2-NEXT:    vcvttss2si %xmm0, %rcx
-; AVX2-NEXT:    vucomiss %xmm1, %xmm0
-; AVX2-NEXT:    cmovaeq %rax, %rcx
-; AVX2-NEXT:    vmovq %rcx, %xmm0
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_8f32_to_4i64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT:    vcvttss2usi %xmm1, %rax
-; AVX512F-NEXT:    vcvttss2usi %xmm0, %rcx
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT:    vcvttss2usi %xmm1, %rdx
-; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT:    vcvttss2usi %xmm0, %rsi
-; AVX512F-NEXT:    vmovq %rsi, %xmm0
-; AVX512F-NEXT:    vmovq %rdx, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vmovq %rcx, %xmm1
-; AVX512F-NEXT:    vmovq %rax, %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_8f32_to_4i64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT:    vcvttss2usi %xmm1, %rax
-; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rcx
-; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT:    vcvttss2usi %xmm1, %rdx
-; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rsi
-; AVX512VL-NEXT:    vmovq %rsi, %xmm0
-; AVX512VL-NEXT:    vmovq %rdx, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    vmovq %rcx, %xmm1
-; AVX512VL-NEXT:    vmovq %rax, %xmm2
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_8f32_to_4i64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_8f32_to_4i64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
-; AVX512VLDQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptoui <8 x float> %a to <8 x i64>
-  %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i64> %shuf
-}
-
-;
-; Constant Folding
-;
-
-define <2 x i64> @fptosi_2f64_to_2i64_const() {
-; SSE-LABEL: fptosi_2f64_to_2i64_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,18446744073709551615]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_2f64_to_2i64_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,18446744073709551615]
-; AVX-NEXT:    retq
-  %cvt = fptosi <2 x double> <double 1.0, double -1.0> to <2 x i64>
-  ret <2 x i64> %cvt
-}
-
-define <4 x i32> @fptosi_2f64_to_2i32_const() {
-; SSE-LABEL: fptosi_2f64_to_2i32_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = <4294967295,1,u,u>
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_2f64_to_2i32_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u>
-; AVX-NEXT:    retq
-  %cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32>
-  %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  ret <4 x i32> %ext
-}
-
-define <4 x i64> @fptosi_4f64_to_4i64_const() {
-; SSE-LABEL: fptosi_4f64_to_4i64_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,18446744073709551615]
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [2,18446744073709551613]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_4f64_to_4i64_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613]
-; AVX-NEXT:    retq
-  %cvt = fptosi <4 x double> <double 1.0, double -1.0, double 2.0, double -3.0> to <4 x i64>
-  ret <4 x i64> %cvt
-}
-
-define <4 x i32> @fptosi_4f64_to_4i32_const() {
-; SSE-LABEL: fptosi_4f64_to_4i32_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_4f64_to_4i32_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
-; AVX-NEXT:    retq
-  %cvt = fptosi <4 x double> <double -1.0, double 1.0, double -2.0, double 3.0> to <4 x i32>
-  ret <4 x i32> %cvt
-}
-
-define <2 x i64> @fptoui_2f64_to_2i64_const() {
-; SSE-LABEL: fptoui_2f64_to_2i64_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [2,4]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptoui_2f64_to_2i64_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [2,4]
-; AVX-NEXT:    retq
-  %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i64>
-  ret <2 x i64> %cvt
-}
-
-define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_2i32_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = <2,4,u,u>
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptoui_2f64_to_2i32_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = <2,4,u,u>
-; AVX-NEXT:    retq
-  %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32>
-  %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  ret <4 x i32> %ext
-}
-
-define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) {
-; SSE-LABEL: fptoui_4f64_to_4i64_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [2,4]
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [6,8]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptoui_4f64_to_4i64_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [2,4,6,8]
-; AVX-NEXT:    retq
-  %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i64>
-  ret <4 x i64> %cvt
-}
-
-define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) {
-; SSE-LABEL: fptoui_4f64_to_4i32_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [2,4,6,8]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptoui_4f64_to_4i32_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [2,4,6,8]
-; AVX-NEXT:    retq
-  %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i32>
-  ret <4 x i32> %cvt
-}
-
-define <4 x i32> @fptosi_4f32_to_4i32_const() {
-; SSE-LABEL: fptosi_4f32_to_4i32_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,4294967295,2,3]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_4f32_to_4i32_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3]
-; AVX-NEXT:    retq
-  %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i32>
-  ret <4 x i32> %cvt
-}
-
-define <4 x i64> @fptosi_4f32_to_4i64_const() {
-; SSE-LABEL: fptosi_4f32_to_4i64_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,18446744073709551615]
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [2,3]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_4f32_to_4i64_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3]
-; AVX-NEXT:    retq
-  %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i64>
-  ret <4 x i64> %cvt
-}
-
-define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) {
-; SSE-LABEL: fptosi_8f32_to_8i32_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,4294967295,2,3]
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [6,4294967288,2,4294967295]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_8f32_to_8i32_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295]
-; AVX-NEXT:    retq
-  %cvt = fptosi <8 x float> <float 1.0, float -1.0, float 2.0, float 3.0, float 6.0, float -8.0, float 2.0, float -1.0> to <8 x i32>
-  ret <8 x i32> %cvt
-}
-
-define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) {
-; SSE-LABEL: fptoui_4f32_to_4i32_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2,4,6]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptoui_4f32_to_4i32_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [1,2,4,6]
-; AVX-NEXT:    retq
-  %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 6.0> to <4 x i32>
-  ret <4 x i32> %cvt
-}
-
-define <4 x i64> @fptoui_4f32_to_4i64_const() {
-; SSE-LABEL: fptoui_4f32_to_4i64_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2]
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4,8]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptoui_4f32_to_4i64_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,4,8]
-; AVX-NEXT:    retq
-  %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 8.0> to <4 x i64>
-  ret <4 x i64> %cvt
-}
-
-define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) {
-; SSE-LABEL: fptoui_8f32_to_8i32_const:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,2,4,6]
-; SSE-NEXT:    movaps {{.*#+}} xmm1 = [8,6,4,1]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptoui_8f32_to_8i32_const:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1]
-; AVX-NEXT:    retq
-  %cvt = fptoui <8 x float> <float 1.0, float 2.0, float 4.0, float 6.0, float 8.0, float 6.0, float 4.0, float 1.0> to <8 x i32>
-  ret <8 x i32> %cvt
-}
-
-;
-; Special Cases
-;
-
-define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
-; SSE-LABEL: fptosi_2f16_to_4i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    callq __gnu_f2h_ieee
-; SSE-NEXT:    movzwl %ax, %edi
-; SSE-NEXT:    callq __gnu_h2f_ieee
-; SSE-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
-; SSE-NEXT:    movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    callq __gnu_f2h_ieee
-; SSE-NEXT:    movzwl %ax, %edi
-; SSE-NEXT:    callq __gnu_h2f_ieee
-; SSE-NEXT:    cvttss2si %xmm0, %eax
-; SSE-NEXT:    cvttss2si (%rsp), %ecx # 4-byte Folded Reload
-; SSE-NEXT:    movd %ecx, %xmm0
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE-NEXT:    popq %rax
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: fptosi_2f16_to_4i32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    pushq %rax
-; VEX-NEXT:    vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; VEX-NEXT:    vmovaps %xmm1, %xmm0
-; VEX-NEXT:    callq __gnu_f2h_ieee
-; VEX-NEXT:    movzwl %ax, %edi
-; VEX-NEXT:    callq __gnu_h2f_ieee
-; VEX-NEXT:    vmovss %xmm0, (%rsp) # 4-byte Spill
-; VEX-NEXT:    vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; VEX-NEXT:    # xmm0 = mem[0],zero,zero,zero
-; VEX-NEXT:    callq __gnu_f2h_ieee
-; VEX-NEXT:    movzwl %ax, %edi
-; VEX-NEXT:    callq __gnu_h2f_ieee
-; VEX-NEXT:    vcvttss2si %xmm0, %eax
-; VEX-NEXT:    vcvttss2si (%rsp), %ecx # 4-byte Folded Reload
-; VEX-NEXT:    vmovd %ecx, %xmm0
-; VEX-NEXT:    vmovd %eax, %xmm1
-; VEX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; VEX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; VEX-NEXT:    popq %rax
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: fptosi_2f16_to_4i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT:    vcvttss2si %xmm0, %eax
-; AVX512-NEXT:    vcvttss2si %xmm1, %ecx
-; AVX512-NEXT:    vmovd %ecx, %xmm0
-; AVX512-NEXT:    vmovd %eax, %xmm1
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT:    retq
-  %cvt = fptosi <2 x half> %a to <2 x i32>
-  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i32> %ext
-}
-
-define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind {
-; SSE-LABEL: fptosi_2f80_to_4i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    fldt {{[0-9]+}}(%rsp)
-; SSE-NEXT:    fldt {{[0-9]+}}(%rsp)
-; SSE-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    orl $3072, %eax # imm = 0xC00
-; SSE-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    fistpl -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    fnstcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    orl $3072, %eax # imm = 0xC00
-; SSE-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    fistpl -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_2f80_to_4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
-; AVX-NEXT:    fldt {{[0-9]+}}(%rsp)
-; AVX-NEXT:    fisttpl -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    fisttpl -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX-NEXT:    retq
-  %cvt = fptosi <2 x x86_fp80> %a to <2 x i32>
-  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i32> %ext
-}
-
-define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
-; SSE-LABEL: fptosi_2f128_to_4i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq %rcx, %r14
-; SSE-NEXT:    movq %rdx, %rbx
-; SSE-NEXT:    callq __fixtfsi
-; SSE-NEXT:    movl %eax, %ebp
-; SSE-NEXT:    movq %rbx, %rdi
-; SSE-NEXT:    movq %r14, %rsi
-; SSE-NEXT:    callq __fixtfsi
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movd %ebp, %xmm1
-; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %rbp
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_2f128_to_4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    pushq %rbp
-; AVX-NEXT:    pushq %r14
-; AVX-NEXT:    pushq %rbx
-; AVX-NEXT:    movq %rcx, %r14
-; AVX-NEXT:    movq %rdx, %rbx
-; AVX-NEXT:    callq __fixtfsi
-; AVX-NEXT:    movl %eax, %ebp
-; AVX-NEXT:    movq %rbx, %rdi
-; AVX-NEXT:    movq %r14, %rsi
-; AVX-NEXT:    callq __fixtfsi
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vmovd %ebp, %xmm1
-; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX-NEXT:    popq %rbx
-; AVX-NEXT:    popq %r14
-; AVX-NEXT:    popq %rbp
-; AVX-NEXT:    retq
-  %cvt = fptosi <2 x fp128> %a to <2 x i32>
-  %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x i32> %ext
-}
-
-define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) {
-; SSE-LABEL: fptosi_2f32_to_2i8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE-NEXT:    packuswb %xmm0, %xmm0
-; SSE-NEXT:    packuswb %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_2f32_to_2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    retq
-  %cvt = fptosi <2 x float> %a to <2 x i8>
-  ret <2 x i8> %cvt
-}
-
-define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) {
-; SSE-LABEL: fptosi_2f32_to_2i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_2f32_to_2i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    retq
-  %cvt = fptosi <2 x float> %a to <2 x i16>
-  ret <2 x i16> %cvt
-}
-
-define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) {
-; SSE-LABEL: fptoui_2f32_to_2i8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE-NEXT:    packuswb %xmm0, %xmm0
-; SSE-NEXT:    packuswb %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptoui_2f32_to_2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    retq
-  %cvt = fptoui <2 x float> %a to <2 x i8>
-  ret <2 x i8> %cvt
-}
-
-define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) {
-; SSE-LABEL: fptoui_2f32_to_2i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptoui_2f32_to_2i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    retq
-  %cvt = fptoui <2 x float> %a to <2 x i16>
-  ret <2 x i16> %cvt
-}
-
-define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) {
-; SSE-LABEL: fptosi_2f64_to_2i8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    andpd {{.*}}(%rip), %xmm0
-; SSE-NEXT:    packuswb %xmm0, %xmm0
-; SSE-NEXT:    packuswb %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_2f64_to_2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    retq
-  %cvt = fptosi <2 x double> %a to <2 x i8>
-  ret <2 x i8> %cvt
-}
-
-define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) {
-; SSE-LABEL: fptosi_2f64_to_2i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptosi_2f64_to_2i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    retq
-  %cvt = fptosi <2 x double> %a to <2 x i16>
-  ret <2 x i16> %cvt
-}
-
-define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_2i8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    andpd {{.*}}(%rip), %xmm0
-; SSE-NEXT:    packuswb %xmm0, %xmm0
-; SSE-NEXT:    packuswb %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptoui_2f64_to_2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    retq
-  %cvt = fptoui <2 x double> %a to <2 x i8>
-  ret <2 x i8> %cvt
-}
-
-define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_2i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: fptoui_2f64_to_2i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT:    retq
-  %cvt = fptoui <2 x double> %a to <2 x i16>
-  ret <2 x i16> %cvt
-}
-
-define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
-; SSE-LABEL: fptosi_8f64_to_8i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttpd2dq %xmm3, %xmm3
-; SSE-NEXT:    cvttpd2dq %xmm2, %xmm2
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT:    cvttpd2dq %xmm1, %xmm1
-; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT:    packssdw %xmm2, %xmm0
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: fptosi_8f64_to_8i16:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; VEX-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; VEX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    vzeroupper
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: fptosi_8f64_to_8i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptosi_8f64_to_8i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptosi_8f64_to_8i16:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptosi_8f64_to_8i16:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512VLDQ-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VLDQ-NEXT:    vzeroupper
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptosi <8 x double> %a to <8 x i16>
-  ret <8 x i16> %cvt
-}
-
-define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) {
-; SSE-LABEL: fptoui_8f64_to_8i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttpd2dq %xmm3, %xmm3
-; SSE-NEXT:    cvttpd2dq %xmm2, %xmm2
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT:    cvttpd2dq %xmm1, %xmm1
-; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: fptoui_8f64_to_8i16:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vcvttpd2dq %ymm1, %xmm1
-; VEX-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; VEX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    vzeroupper
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_8f64_to_8i16:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_8f64_to_8i16:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_8f64_to_8i16:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_8f64_to_8i16:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512VLDQ-NEXT:    vpmovdw %ymm0, %xmm0
-; AVX512VLDQ-NEXT:    vzeroupper
-; AVX512VLDQ-NEXT:    retq
-  %cvt = fptoui <8 x double> %a to <8 x i16>
-  ret <8 x i16> %cvt
-}
-
-define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
-; SSE-LABEL: fptosi_16f32_to_16i8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttps2dq %xmm3, %xmm3
-; SSE-NEXT:    cvttps2dq %xmm2, %xmm2
-; SSE-NEXT:    packssdw %xmm3, %xmm2
-; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    packssdw %xmm1, %xmm0
-; SSE-NEXT:    packsswb %xmm2, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptosi_16f32_to_16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vcvttps2dq %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptosi_16f32_to_16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vcvttps2dq %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: fptosi_16f32_to_16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vcvttps2dq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %cvt = fptosi <16 x float> %a to <16 x i8>
-  ret <16 x i8> %cvt
-}
-
-define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
-; SSE-LABEL: fptoui_16f32_to_16i8:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvttps2dq %xmm3, %xmm3
-; SSE-NEXT:    cvttps2dq %xmm2, %xmm2
-; SSE-NEXT:    packssdw %xmm3, %xmm2
-; SSE-NEXT:    cvttps2dq %xmm1, %xmm1
-; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    packssdw %xmm1, %xmm0
-; SSE-NEXT:    packuswb %xmm2, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: fptoui_16f32_to_16i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vcvttps2dq %ymm1, %ymm1
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: fptoui_16f32_to_16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vcvttps2dq %ymm1, %ymm1
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: fptoui_16f32_to_16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vcvttps2dq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %cvt = fptoui <16 x float> %a to <16 x i8>
-  ret <16 x i8> %cvt
-}
-
-define <2 x i64> @fptosi_2f32_to_2i64_load(<2 x float>* %x) {
-; SSE-LABEL: fptosi_2f32_to_2i64_load:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE-NEXT:    cvttss2si %xmm1, %rax
-; SSE-NEXT:    movq %rax, %xmm1
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: fptosi_2f32_to_2i64_load:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; VEX-NEXT:    vcvttss2si %xmm0, %rax
-; VEX-NEXT:    vmovq %rax, %xmm1
-; VEX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; VEX-NEXT:    vcvttss2si %xmm0, %rax
-; VEX-NEXT:    vmovq %rax, %xmm0
-; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: fptosi_2f32_to_2i64_load:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptosi_2f32_to_2i64_load:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT:    vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptosi_2f32_to_2i64_load:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64_load:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2qq (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %a = load <2 x float>, <2 x float>* %x
-  %b = fptosi <2 x float> %a to <2 x i64>
-  ret <2 x i64> %b
-}
-
-define <2 x i64> @fptoui_2f32_to_2i64_load(<2 x float>* %x) {
-; SSE-LABEL: fptoui_2f32_to_2i64_load:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    subss %xmm2, %xmm0
-; SSE-NEXT:    cvttss2si %xmm0, %rax
-; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttss2si %xmm1, %rdx
-; SSE-NEXT:    ucomiss %xmm2, %xmm1
-; SSE-NEXT:    cmovaeq %rax, %rdx
-; SSE-NEXT:    movq %rdx, %xmm0
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE-NEXT:    movaps %xmm1, %xmm3
-; SSE-NEXT:    subss %xmm2, %xmm3
-; SSE-NEXT:    cvttss2si %xmm3, %rax
-; SSE-NEXT:    xorq %rcx, %rax
-; SSE-NEXT:    cvttss2si %xmm1, %rcx
-; SSE-NEXT:    ucomiss %xmm2, %xmm1
-; SSE-NEXT:    cmovaeq %rax, %rcx
-; SSE-NEXT:    movq %rcx, %xmm1
-; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: fptoui_2f32_to_2i64_load:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; VEX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm2
-; VEX-NEXT:    vcvttss2si %xmm2, %rax
-; VEX-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttss2si %xmm0, %rdx
-; VEX-NEXT:    vucomiss %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rdx
-; VEX-NEXT:    vmovq %rdx, %xmm2
-; VEX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; VEX-NEXT:    vsubss %xmm1, %xmm0, %xmm3
-; VEX-NEXT:    vcvttss2si %xmm3, %rax
-; VEX-NEXT:    xorq %rcx, %rax
-; VEX-NEXT:    vcvttss2si %xmm0, %rcx
-; VEX-NEXT:    vucomiss %xmm1, %xmm0
-; VEX-NEXT:    cmovaeq %rax, %rcx
-; VEX-NEXT:    vmovq %rcx, %xmm0
-; VEX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: fptoui_2f32_to_2i64_load:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: fptoui_2f32_to_2i64_load:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm1
-; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT:    vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT:    vmovq %rax, %xmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: fptoui_2f32_to_2i64_load:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64_load:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvttps2uqq (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %a = load <2 x float>, <2 x float>* %x
-  %b = fptoui <2 x float> %a to <2 x i64>
-  ret <2 x i64> %b
-}

Removed: llvm/trunk/test/CodeGen/X86/vec_int_to_fp-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_int_to_fp-widen.ll?rev=368078&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_int_to_fp-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_int_to_fp-widen.ll (removed)
@@ -1,6008 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX1
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX2
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VLDQ
-;
-; 32-bit tests to make sure we're not doing anything stupid.
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1
-
-;
-; Signed Integer to Double
-;
-
-define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
-; SSE2-LABEL: sitofp_2i64_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_2i64_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm1
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE41-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_2i64_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
-; VEX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: sitofp_2i64_to_2f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: sitofp_2i64_to_2f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: sitofp_2i64_to_2f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtqq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = sitofp <2 x i64> %a to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
-; SSE-LABEL: sitofp_2i32_to_2f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_2i32_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
-  %cvt = sitofp <2 x i32> %shuf to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
-; SSE-LABEL: sitofp_4i32_to_2f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_4i32_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %cvt = sitofp <4 x i32> %a to <4 x double>
-  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x double> %shuf
-}
-
-define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_2i16_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_2i16_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_2i16_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-  %cvt = sitofp <2 x i16> %shuf to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_8i16_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_8i16_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_8i16_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_8i16_to_2f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %cvt = sitofp <8 x i16> %a to <8 x double>
-  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x double> %shuf
-}
-
-define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_2i8_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_2i8_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_2i8_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-  %cvt = sitofp <2 x i8> %shuf to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_16i8_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_16i8_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_16i8_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_16i8_to_2f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %cvt = sitofp <16 x i8> %a to <16 x double>
-  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x double> %shuf
-}
-
-define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
-; SSE2-LABEL: sitofp_4i64_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE2-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE2-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    movaps %xmm3, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_4i64_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm2
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE41-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm2
-; SSE41-NEXT:    movq %xmm1, %rax
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm1
-; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: sitofp_4i64_to_4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX1-NEXT:    vmovq %xmm1, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_4i64_to_4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX2-NEXT:    vmovq %xmm1, %rax
-; AVX2-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX2-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: sitofp_4i64_to_4f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: sitofp_4i64_to_4f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: sitofp_4i64_to_4f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: sitofp_4i64_to_4f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtqq2pd %ymm0, %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = sitofp <4 x i64> %a to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
-; SSE-LABEL: sitofp_4i32_to_4f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2pd %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT:    cvtdq2pd %xmm0, %xmm1
-; SSE-NEXT:    movaps %xmm2, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_4i32_to_4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT:    retq
-  %cvt = sitofp <4 x i32> %a to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_4i16_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_4i16_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_4i16_to_4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cvt = sitofp <4 x i16> %shuf to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_8i16_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_8i16_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_8i16_to_4f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_8i16_to_4f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT:    retq
-  %cvt = sitofp <8 x i16> %a to <8 x double>
-  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x double> %shuf
-}
-
-define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_4i8_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_4i8_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_4i8_to_4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cvt = sitofp <4 x i8> %shuf to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_16i8_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_16i8_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_16i8_to_4f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_16i8_to_4f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT:    retq
-  %cvt = sitofp <16 x i8> %a to <16 x double>
-  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x double> %shuf
-}
-
-;
-; Unsigned Integer to Double
-;
-
-define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
-; SSE2-LABEL: uitofp_2i64_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_2i64_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT:    por {{.*}}(%rip), %xmm1
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    subpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_2i64_to_2f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_2i64_to_2f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_2i64_to_2f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_2i64_to_2f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_2i64_to_2f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtuqq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = uitofp <2 x i64> %a to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
-; SSE2-LABEL: uitofp_2i32_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_2i32_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_2i32_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_2i32_to_2f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_2i32_to_2f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_2i32_to_2f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
-  %cvt = uitofp <2 x i32> %shuf to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
-; SSE2-LABEL: uitofp_4i32_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_4i32_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_4i32_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_4i32_to_2f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_4i32_to_2f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_4i32_to_2f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = uitofp <4 x i32> %a to <4 x double>
-  %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x double> %shuf
-}
-
-define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_2i16_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_2i16_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_2i16_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-  %cvt = uitofp <2 x i16> %shuf to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_8i16_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_8i16_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_8i16_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: uitofp_8i16_to_2f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %cvt = uitofp <8 x i16> %a to <8 x double>
-  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x double> %shuf
-}
-
-define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_2i8_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_2i8_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_2i8_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-  %cvt = uitofp <2 x i8> %shuf to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_16i8_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_16i8_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_16i8_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: uitofp_16i8_to_2f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %cvt = uitofp <16 x i8> %a to <16 x double>
-  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x double> %shuf
-}
-
-define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
-; SSE2-LABEL: uitofp_4i64_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
-; SSE2-NEXT:    por %xmm4, %xmm3
-; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
-; SSE2-NEXT:    por %xmm5, %xmm0
-; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
-; SSE2-NEXT:    subpd %xmm6, %xmm0
-; SSE2-NEXT:    addpd %xmm3, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    psrlq $32, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    subpd %xmm6, %xmm1
-; SSE2-NEXT:    addpd %xmm2, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_4i64_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
-; SSE41-NEXT:    por %xmm4, %xmm3
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
-; SSE41-NEXT:    por %xmm5, %xmm0
-; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
-; SSE41-NEXT:    subpd %xmm6, %xmm0
-; SSE41-NEXT:    addpd %xmm3, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT:    por %xmm4, %xmm2
-; SSE41-NEXT:    psrlq $32, %xmm1
-; SSE41-NEXT:    por %xmm5, %xmm1
-; SSE41-NEXT:    subpd %xmm6, %xmm1
-; SSE41-NEXT:    addpd %xmm2, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_4i64_to_4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_4i64_to_4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_4i64_to_4f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX512F-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_4i64_to_4f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
-; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT:    vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_4i64_to_4f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i64_to_4f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtuqq2pd %ymm0, %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = uitofp <4 x i64> %a to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
-; SSE2-LABEL: uitofp_4i32_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE2-NEXT:    mulpd %xmm2, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm5
-; SSE2-NEXT:    mulpd %xmm2, %xmm5
-; SSE2-NEXT:    pand %xmm3, %xmm4
-; SSE2-NEXT:    cvtdq2pd %xmm4, %xmm1
-; SSE2-NEXT:    addpd %xmm5, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_4i32_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE41-NEXT:    mulpd %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm5
-; SSE41-NEXT:    mulpd %xmm2, %xmm5
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
-; SSE41-NEXT:    cvtdq2pd %xmm4, %xmm1
-; SSE41-NEXT:    addpd %xmm5, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_4i32_to_4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_4i32_to_4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
-; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_4i32_to_4f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_4i32_to_4f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_4i32_to_4f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i32_to_4f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = uitofp <4 x i32> %a to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_4i16_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_4i16_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_4i16_to_4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cvt = uitofp <4 x i16> %shuf to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_8i16_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_8i16_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_8i16_to_4f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: uitofp_8i16_to_4f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT:    retq
-  %cvt = uitofp <8 x i16> %a to <8 x double>
-  %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x double> %shuf
-}
-
-define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_4i8_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_4i8_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_4i8_to_4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cvt = uitofp <4 x i8> %shuf to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_16i8_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_16i8_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_16i8_to_4f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: uitofp_16i8_to_4f64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT:    retq
-  %cvt = uitofp <16 x i8> %a to <16 x double>
-  %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x double> %shuf
-}
-
-;
-; Signed Integer to Float
-;
-
-define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
-; SSE2-LABEL: sitofp_2i64_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_2i64_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_2i64_to_4f32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: sitofp_2i64_to_4f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: sitofp_2i64_to_4f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: sitofp_2i64_to_4f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtqq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = sitofp <2 x i64> %a to <2 x float>
-  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  ret <4 x float> %ext
-}
-
-define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
-; SSE2-LABEL: sitofp_2i64_to_4f32_zero:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_2i64_to_4f32_zero:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero
-; SSE41-NEXT:    movaps %xmm1, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_2i64_to_4f32_zero:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtqq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = sitofp <2 x i64> %a to <2 x float>
-  %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x float> %ext
-}
-
-define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
-; SSE2-LABEL: sitofp_4i64_to_4f32_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_4i64_to_4f32_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_4i64_to_4f32_undef:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VLDQ-NEXT:    vcvtqq2ps %ymm0, %xmm0
-; AVX512VLDQ-NEXT:    vzeroupper
-; AVX512VLDQ-NEXT:    retq
-  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %cvt = sitofp <4 x i64> %ext to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
-; SSE-LABEL: sitofp_4i32_to_4f32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_4i32_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %cvt = sitofp <4 x i32> %a to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_4i16_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_4i16_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_4i16_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cvt = sitofp <4 x i16> %shuf to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_8i16_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_8i16_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: sitofp_8i16_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_8i16_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_8i16_to_4f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %cvt = sitofp <8 x i16> %a to <8 x float>
-  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x float> %shuf
-}
-
-define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_4i8_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_4i8_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_4i8_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cvt = sitofp <4 x i8> %shuf to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_16i8_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_16i8_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: sitofp_16i8_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_16i8_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_16i8_to_4f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %cvt = sitofp <16 x i8> %a to <16 x float>
-  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x float> %shuf
-}
-
-define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
-; SSE2-LABEL: sitofp_4i64_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_4i64_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; SSE41-NEXT:    movq %xmm1, %rax
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: sitofp_4i64_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_4i64_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: sitofp_4i64_to_4f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: sitofp_4i64_to_4f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: sitofp_4i64_to_4f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtqq2ps %ymm0, %xmm0
-; AVX512VLDQ-NEXT:    vzeroupper
-; AVX512VLDQ-NEXT:    retq
-  %cvt = sitofp <4 x i64> %a to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
-; SSE-LABEL: sitofp_8i32_to_8f32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT:    cvtdq2ps %xmm1, %xmm1
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_8i32_to_8f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT:    retq
-  %cvt = sitofp <8 x i32> %a to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_8i16_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_8i16_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxwd %xmm0, %xmm1
-; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT:    movaps %xmm2, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: sitofp_8i16_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_8i16_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_8i16_to_8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
-; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT:    retq
-  %cvt = sitofp <8 x i16> %a to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_8i8_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_8i8_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
-; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT:    movaps %xmm2, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: sitofp_8i8_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_8i8_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_8i8_to_8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %ymm0
-; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT:    retq
-  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %cvt = sitofp <8 x i8> %shuf to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_16i8_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_16i8_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbd %xmm0, %xmm1
-; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT:    movaps %xmm2, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: sitofp_16i8_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_16i8_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_16i8_to_8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT:    retq
-  %cvt = sitofp <16 x i8> %a to <16 x float>
-  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x float> %shuf
-}
-
-;
-; Unsigned Integer to Float
-;
-
-define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
-; SSE2-LABEL: uitofp_2i64_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB39_1
-; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    jmp .LBB39_3
-; SSE2-NEXT:  .LBB39_1:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    addss %xmm0, %xmm0
-; SSE2-NEXT:  .LBB39_3:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB39_4
-; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    retq
-; SSE2-NEXT:  .LBB39_4:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    addss %xmm1, %xmm1
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_2i64_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB39_1
-; SSE41-NEXT:  # %bb.2:
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    jmp .LBB39_3
-; SSE41-NEXT:  .LBB39_1:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    addss %xmm1, %xmm1
-; SSE41-NEXT:  .LBB39_3:
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB39_4
-; SSE41-NEXT:  # %bb.5:
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-NEXT:    retq
-; SSE41-NEXT:  .LBB39_4:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    addss %xmm0, %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_2i64_to_4f32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB39_1
-; VEX-NEXT:  # %bb.2:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    jmp .LBB39_3
-; VEX-NEXT:  .LBB39_1:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT:  .LBB39_3:
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB39_4
-; VEX-NEXT:  # %bb.5:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT:    retq
-; VEX-NEXT:  .LBB39_4:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_2i64_to_4f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_2i64_to_4f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_2i64_to_4f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtuqq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = uitofp <2 x i64> %a to <2 x float>
-  %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  ret <4 x float> %ext
-}
-
-define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
-; SSE2-LABEL: uitofp_2i64_to_2f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB40_1
-; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    jmp .LBB40_3
-; SSE2-NEXT:  .LBB40_1:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    addss %xmm1, %xmm1
-; SSE2-NEXT:  .LBB40_3:
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB40_4
-; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    jmp .LBB40_6
-; SSE2-NEXT:  .LBB40_4:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    addss %xmm0, %xmm0
-; SSE2-NEXT:  .LBB40_6:
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_2i64_to_2f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB40_1
-; SSE41-NEXT:  # %bb.2:
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    jmp .LBB40_3
-; SSE41-NEXT:  .LBB40_1:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    addss %xmm0, %xmm0
-; SSE41-NEXT:  .LBB40_3:
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB40_4
-; SSE41-NEXT:  # %bb.5:
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; SSE41-NEXT:    retq
-; SSE41-NEXT:  .LBB40_4:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    addss %xmm1, %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_2i64_to_2f32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB40_1
-; VEX-NEXT:  # %bb.2:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    jmp .LBB40_3
-; VEX-NEXT:  .LBB40_1:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT:  .LBB40_3:
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB40_4
-; VEX-NEXT:  # %bb.5:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; VEX-NEXT:    retq
-; VEX-NEXT:  .LBB40_4:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_2i64_to_2f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_2i64_to_2f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_2i64_to_2f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtuqq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = uitofp <2 x i64> %a to <2 x float>
-  %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x float> %ext
-}
-
-define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
-; SSE2-LABEL: uitofp_4i64_to_4f32_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB41_1
-; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    jmp .LBB41_3
-; SSE2-NEXT:  .LBB41_1:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    addss %xmm1, %xmm1
-; SSE2-NEXT:  .LBB41_3:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB41_4
-; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    jmp .LBB41_6
-; SSE2-NEXT:  .LBB41_4:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    addss %xmm0, %xmm0
-; SSE2-NEXT:  .LBB41_6:
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_4i64_to_4f32_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB41_1
-; SSE41-NEXT:  # %bb.2:
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    jmp .LBB41_3
-; SSE41-NEXT:  .LBB41_1:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    addss %xmm1, %xmm1
-; SSE41-NEXT:  .LBB41_3:
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB41_4
-; SSE41-NEXT:  # %bb.5:
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; SSE41-NEXT:    retq
-; SSE41-NEXT:  .LBB41_4:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    addss %xmm0, %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_4i64_to_4f32_undef:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB41_1
-; VEX-NEXT:  # %bb.2:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    jmp .LBB41_3
-; VEX-NEXT:  .LBB41_1:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT:  .LBB41_3:
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB41_4
-; VEX-NEXT:  # %bb.5:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT:    retq
-; VEX-NEXT:  .LBB41_4:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VLDQ-NEXT:    vcvtuqq2ps %ymm0, %xmm0
-; AVX512VLDQ-NEXT:    vzeroupper
-; AVX512VLDQ-NEXT:    retq
-  %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-  %cvt = uitofp <4 x i64> %ext to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
-; SSE2-LABEL: uitofp_4i32_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    addps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    addps %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_4i32_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; SSE41-NEXT:    addps {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    addps %xmm1, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_4i32_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_4i32_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; AVX2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_4i32_to_4f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_4i32_to_4f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_4i32_to_4f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = uitofp <4 x i32> %a to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_4i16_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_4i16_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_4i16_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cvt = uitofp <4 x i16> %shuf to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_8i16_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_8i16_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_8i16_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_8i16_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: uitofp_8i16_to_4f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %cvt = uitofp <8 x i16> %a to <8 x float>
-  %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x float> %shuf
-}
-
-define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_4i8_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_4i8_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_4i8_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %cvt = uitofp <4 x i8> %shuf to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_16i8_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_16i8_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_16i8_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_16i8_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: uitofp_16i8_to_4f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %cvt = uitofp <16 x i8> %a to <16 x float>
-  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ret <4 x float> %shuf
-}
-
-define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
-; SSE2-LABEL: uitofp_4i64_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB47_1
-; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    jmp .LBB47_3
-; SSE2-NEXT:  .LBB47_1:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    addss %xmm2, %xmm2
-; SSE2-NEXT:  .LBB47_3:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB47_4
-; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
-; SSE2-NEXT:    jmp .LBB47_6
-; SSE2-NEXT:  .LBB47_4:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
-; SSE2-NEXT:    addss %xmm3, %xmm3
-; SSE2-NEXT:  .LBB47_6:
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB47_7
-; SSE2-NEXT:  # %bb.8:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    jmp .LBB47_9
-; SSE2-NEXT:  .LBB47_7:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    addss %xmm1, %xmm1
-; SSE2-NEXT:  .LBB47_9:
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB47_10
-; SSE2-NEXT:  # %bb.11:
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    jmp .LBB47_12
-; SSE2-NEXT:  .LBB47_10:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    addss %xmm0, %xmm0
-; SSE2-NEXT:  .LBB47_12:
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_4i64_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB47_1
-; SSE41-NEXT:  # %bb.2:
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    jmp .LBB47_3
-; SSE41-NEXT:  .LBB47_1:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    addss %xmm2, %xmm2
-; SSE41-NEXT:  .LBB47_3:
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB47_4
-; SSE41-NEXT:  # %bb.5:
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    jmp .LBB47_6
-; SSE41-NEXT:  .LBB47_4:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    addss %xmm0, %xmm0
-; SSE41-NEXT:  .LBB47_6:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; SSE41-NEXT:    movq %xmm1, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB47_7
-; SSE41-NEXT:  # %bb.8:
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    jmp .LBB47_9
-; SSE41-NEXT:  .LBB47_7:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    addss %xmm2, %xmm2
-; SSE41-NEXT:  .LBB47_9:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB47_10
-; SSE41-NEXT:  # %bb.11:
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT:    retq
-; SSE41-NEXT:  .LBB47_10:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    addss %xmm1, %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_4i64_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB47_1
-; AVX1-NEXT:  # %bb.2:
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX1-NEXT:    jmp .LBB47_3
-; AVX1-NEXT:  .LBB47_1:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX1-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:  .LBB47_3:
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB47_4
-; AVX1-NEXT:  # %bb.5:
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX1-NEXT:    jmp .LBB47_6
-; AVX1-NEXT:  .LBB47_4:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:  .LBB47_6:
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB47_7
-; AVX1-NEXT:  # %bb.8:
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX1-NEXT:    jmp .LBB47_9
-; AVX1-NEXT:  .LBB47_7:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX1-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:  .LBB47_9:
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX1-NEXT:    testq %rax, %rax
-; AVX1-NEXT:    js .LBB47_10
-; AVX1-NEXT:  # %bb.11:
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-; AVX1-NEXT:  .LBB47_10:
-; AVX1-NEXT:    movq %rax, %rcx
-; AVX1-NEXT:    shrq %rcx
-; AVX1-NEXT:    andl $1, %eax
-; AVX1-NEXT:    orq %rcx, %rax
-; AVX1-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_4i64_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB47_1
-; AVX2-NEXT:  # %bb.2:
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX2-NEXT:    jmp .LBB47_3
-; AVX2-NEXT:  .LBB47_1:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX2-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:  .LBB47_3:
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB47_4
-; AVX2-NEXT:  # %bb.5:
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX2-NEXT:    jmp .LBB47_6
-; AVX2-NEXT:  .LBB47_4:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:  .LBB47_6:
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB47_7
-; AVX2-NEXT:  # %bb.8:
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX2-NEXT:    jmp .LBB47_9
-; AVX2-NEXT:  .LBB47_7:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX2-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:  .LBB47_9:
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    testq %rax, %rax
-; AVX2-NEXT:    js .LBB47_10
-; AVX2-NEXT:  # %bb.11:
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-; AVX2-NEXT:  .LBB47_10:
-; AVX2-NEXT:    movq %rax, %rcx
-; AVX2-NEXT:    shrq %rcx
-; AVX2-NEXT:    andl $1, %eax
-; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX2-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_4i64_to_4f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_4i64_to_4f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_4i64_to_4f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtuqq2ps %ymm0, %xmm0
-; AVX512VLDQ-NEXT:    vzeroupper
-; AVX512VLDQ-NEXT:    retq
-  %cvt = uitofp <4 x i64> %a to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
-; SSE2-LABEL: uitofp_8i32_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
-; SSE2-NEXT:    por %xmm4, %xmm3
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
-; SSE2-NEXT:    por %xmm5, %xmm0
-; SSE2-NEXT:    movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; SSE2-NEXT:    addps %xmm6, %xmm0
-; SSE2-NEXT:    addps %xmm3, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    addps %xmm6, %xmm1
-; SSE2-NEXT:    addps %xmm2, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_8i32_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; SSE41-NEXT:    movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; SSE41-NEXT:    addps %xmm5, %xmm0
-; SSE41-NEXT:    addps %xmm3, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; SSE41-NEXT:    addps %xmm5, %xmm1
-; SSE41-NEXT:    addps %xmm2, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_8i32_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
-; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_8i32_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_8i32_to_8f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_8i32_to_8f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2ps %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_8i32_to_8f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_8i32_to_8f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2ps %ymm0, %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %cvt = uitofp <8 x i32> %a to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_8i16_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_8i16_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT:    movaps %xmm2, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_8i16_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_8i16_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: uitofp_8i16_to_8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT:    retq
-  %cvt = uitofp <8 x i16> %a to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_8i8_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_8i8_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT:    movaps %xmm2, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_8i8_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_8i8_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: uitofp_8i8_to_8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT:    retq
-  %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %cvt = uitofp <8 x i8> %shuf to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_16i8_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm2, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT:    movaps %xmm2, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_16i8_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT:    movaps %xmm2, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_16i8_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_16i8_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: uitofp_16i8_to_8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT:    retq
-  %cvt = uitofp <16 x i8> %a to <16 x float>
-  %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x float> %shuf
-}
-
-;
-; Load Signed Integer to Double
-;
-
-define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
-; SSE2-LABEL: sitofp_load_2i64_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_2i64_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm1
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE41-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_load_2i64_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovdqa (%rdi), %xmm0
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
-; VEX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtqq2pd (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <2 x i64>, <2 x i64> *%a
-  %cvt = sitofp <2 x i64> %ld to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
-; SSE-LABEL: sitofp_load_2i32_to_2f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_2i32_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
-; AVX-NEXT:    retq
-  %ld = load <2 x i32>, <2 x i32> *%a
-  %cvt = sitofp <2 x i32> %ld to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) {
-; SSE-LABEL: sitofp_volatile_load_4i32_to_2f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %ld = load volatile <4 x i32>, <4 x i32> *%a
-  %b = shufflevector <4 x i32> %ld, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
-  %cvt = sitofp <2 x i32> %b to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
-; SSE-LABEL: sitofp_load_4i32_to_2f64_2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_4i32_to_2f64_2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
-; AVX-NEXT:    retq
-  %a = load <4 x i32>, <4 x i32>* %x
-  %b = sitofp <4 x i32> %a to <4 x double>
-  %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x double> %c
-}
-
-define <2 x double> @sitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) {
-; SSE-LABEL: sitofp_volatile_load_4i32_to_2f64_2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64_2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %a = load volatile <4 x i32>, <4 x i32>* %x
-  %b = sitofp <4 x i32> %a to <4 x double>
-  %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x double> %c
-}
-
-define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
-; SSE2-LABEL: sitofp_load_2i16_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_2i16_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT:    pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_2i16_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %ld = load <2 x i16>, <2 x i16> *%a
-  %cvt = sitofp <2 x i16> %ld to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
-; SSE2-LABEL: sitofp_load_2i8_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movzwl (%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_2i8_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movzwl (%rdi), %eax
-; SSE41-NEXT:    movd %eax, %xmm0
-; SSE41-NEXT:    pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_2i8_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzwl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %ld = load <2 x i8>, <2 x i8> *%a
-  %cvt = sitofp <2 x i8> %ld to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
-; SSE2-LABEL: sitofp_load_4i64_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    movq %xmm2, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT:    movq %xmm2, %rax
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm2
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_4i64_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm2
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE41-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm2
-; SSE41-NEXT:    movq %xmm1, %rax
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm1
-; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_load_4i64_to_4f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovdqa (%rdi), %xmm0
-; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; VEX-NEXT:    vpextrq $1, %xmm1, %rax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
-; VEX-NEXT:    vmovq %xmm1, %rax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
-; VEX-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
-; VEX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtqq2pd (%rdi), %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <4 x i64>, <4 x i64> *%a
-  %cvt = sitofp <4 x i64> %ld to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
-; SSE-LABEL: sitofp_load_4i32_to_4f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rdi), %xmm1
-; SSE-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_4i32_to_4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2pd (%rdi), %ymm0
-; AVX-NEXT:    retq
-  %ld = load <4 x i32>, <4 x i32> *%a
-  %cvt = sitofp <4 x i32> %ld to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
-; SSE2-LABEL: sitofp_load_4i16_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_4i16_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxwd (%rdi), %xmm1
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_4i16_to_4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT:    retq
-  %ld = load <4 x i16>, <4 x i16> *%a
-  %cvt = sitofp <4 x i16> %ld to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
-; SSE2-LABEL: sitofp_load_4i8_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_4i8_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbd (%rdi), %xmm1
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_4i8_to_4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT:    retq
-  %ld = load <4 x i8>, <4 x i8> *%a
-  %cvt = sitofp <4 x i8> %ld to <4 x double>
-  ret <4 x double> %cvt
-}
-
-;
-; Load Unsigned Integer to Double
-;
-
-define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
-; SSE2-LABEL: uitofp_load_2i64_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    subpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_2i64_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT:    por {{.*}}(%rip), %xmm1
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    por {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    subpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_load_2i64_to_2f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_load_2i64_to_2f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT:    vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtuqq2pd (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <2 x i64>, <2 x i64> *%a
-  %cvt = uitofp <2 x i64> %ld to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
-; SSE2-LABEL: uitofp_load_2i32_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_2i32_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_load_2i32_to_2f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <2 x i32>, <2 x i32> *%a
-  %cvt = uitofp <2 x i32> %ld to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
-; SSE2-LABEL: uitofp_load_4i32_to_2f64_2:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_4i32_to_2f64_2:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_load_4i32_to_2f64_2:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovdqa (%rdi), %xmm0
-; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %a = load <4 x i32>, <4 x i32>* %x
-  %b = uitofp <4 x i32> %a to <4 x double>
-  %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x double> %c
-}
-
-define <2 x double> @uitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) {
-; SSE2-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovdqa (%rdi), %xmm0
-; VEX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; VEX-NEXT:    vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT:    vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT:    vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %a = load volatile <4 x i32>, <4 x i32>* %x
-  %b = uitofp <4 x i32> %a to <4 x double>
-  %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
-  ret <2 x double> %c
-}
-
-define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
-; SSE2-LABEL: uitofp_load_2i16_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_2i16_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_load_2i16_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %ld = load <2 x i16>, <2 x i16> *%a
-  %cvt = uitofp <2 x i16> %ld to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
-; SSE2-LABEL: uitofp_load_2i8_to_2f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movzwl (%rdi), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_2i8_to_2f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movzwl (%rdi), %eax
-; SSE41-NEXT:    movd %eax, %xmm0
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_load_2i8_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzwl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %ld = load <2 x i8>, <2 x i8> *%a
-  %cvt = uitofp <2 x i8> %ld to <2 x double>
-  ret <2 x double> %cvt
-}
-
-define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
-; SSE2-LABEL: uitofp_load_4i64_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
-; SSE2-NEXT:    por %xmm4, %xmm3
-; SSE2-NEXT:    psrlq $32, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
-; SSE2-NEXT:    por %xmm5, %xmm0
-; SSE2-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
-; SSE2-NEXT:    subpd %xmm6, %xmm0
-; SSE2-NEXT:    addpd %xmm3, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    psrlq $32, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    subpd %xmm6, %xmm1
-; SSE2-NEXT:    addpd %xmm2, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_4i64_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
-; SSE41-NEXT:    por %xmm4, %xmm3
-; SSE41-NEXT:    psrlq $32, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
-; SSE41-NEXT:    por %xmm5, %xmm0
-; SSE41-NEXT:    movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
-; SSE41-NEXT:    subpd %xmm6, %xmm0
-; SSE41-NEXT:    addpd %xmm3, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT:    por %xmm4, %xmm2
-; SSE41-NEXT:    psrlq $32, %xmm1
-; SSE41-NEXT:    por %xmm5, %xmm1
-; SSE41-NEXT:    subpd %xmm6, %xmm1
-; SSE41-NEXT:    addpd %xmm2, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_load_4i64_to_4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
-; AVX1-NEXT:    vorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vorpd {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vsubpd {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_load_4i64_to_4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; AVX2-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; AVX512F-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX512F-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT:    vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
-; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; AVX512VL-NEXT:    vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT:    vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtuqq2pd (%rdi), %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <4 x i64>, <4 x i64> *%a
-  %cvt = uitofp <4 x i64> %ld to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
-; SSE2-LABEL: uitofp_load_4i32_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE2-NEXT:    mulpd %xmm2, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT:    addpd %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm5
-; SSE2-NEXT:    mulpd %xmm2, %xmm5
-; SSE2-NEXT:    pand %xmm3, %xmm4
-; SSE2-NEXT:    cvtdq2pd %xmm4, %xmm1
-; SSE2-NEXT:    addpd %xmm5, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_4i32_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE41-NEXT:    mulpd %xmm2, %xmm1
-; SSE41-NEXT:    pxor %xmm3, %xmm3
-; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
-; SSE41-NEXT:    cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT:    addpd %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm4, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm5
-; SSE41-NEXT:    mulpd %xmm2, %xmm5
-; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
-; SSE41-NEXT:    cvtdq2pd %xmm4, %xmm1
-; SSE41-NEXT:    addpd %xmm5, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_load_4i32_to_4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX1-NEXT:    vmulpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_load_4i32_to_4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
-; AVX2-NEXT:    vmulpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX2-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <4 x i32>, <4 x i32> *%a
-  %cvt = uitofp <4 x i32> %ld to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
-; SSE2-LABEL: uitofp_load_4i16_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_4i16_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_load_4i16_to_4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT:    retq
-  %ld = load <4 x i16>, <4 x i16> *%a
-  %cvt = uitofp <4 x i16> %ld to <4 x double>
-  ret <4 x double> %cvt
-}
-
-define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
-; SSE2-LABEL: uitofp_load_4i8_to_4f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_4i8_to_4f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT:    cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_load_4i8_to_4f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT:    retq
-  %ld = load <4 x i8>, <4 x i8> *%a
-  %cvt = uitofp <4 x i8> %ld to <4 x double>
-  ret <4 x double> %cvt
-}
-
-;
-; Load Signed Integer to Float
-;
-
-define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
-; SSE2-LABEL: sitofp_load_4i64_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_4i64_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; SSE41-NEXT:    movq %xmm1, %rax
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_load_4i64_to_4f32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovdqa (%rdi), %xmm0
-; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; VEX-NEXT:    vmovq %xmm1, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; VEX-NEXT:    vpextrq $1, %xmm1, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtqq2psy (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <4 x i64>, <4 x i64> *%a
-  %cvt = sitofp <4 x i64> %ld to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
-; SSE-LABEL: sitofp_load_4i32_to_4f32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2ps (%rdi), %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_4i32_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2ps (%rdi), %xmm0
-; AVX-NEXT:    retq
-  %ld = load <4 x i32>, <4 x i32> *%a
-  %cvt = sitofp <4 x i32> %ld to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
-; SSE2-LABEL: sitofp_load_4i16_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_4i16_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_4i16_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxwd (%rdi), %xmm0
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %ld = load <4 x i16>, <4 x i16> *%a
-  %cvt = sitofp <4 x i16> %ld to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
-; SSE2-LABEL: sitofp_load_4i8_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_4i8_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_4i8_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovsxbd (%rdi), %xmm0
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %ld = load <4 x i8>, <4 x i8> *%a
-  %cvt = sitofp <4 x i8> %ld to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
-; SSE2-LABEL: sitofp_load_8i64_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
-; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
-; SSE2-NEXT:    movdqa 48(%rdi), %xmm3
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE2-NEXT:    movq %xmm3, %rax
-; SSE2-NEXT:    xorps %xmm4, %xmm4
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE2-NEXT:    movq %xmm2, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT:    movq %xmm2, %rax
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_8i64_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE41-NEXT:    movdqa 32(%rdi), %xmm2
-; SSE41-NEXT:    movdqa 48(%rdi), %xmm3
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm4
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
-; SSE41-NEXT:    movq %xmm1, %rax
-; SSE41-NEXT:    xorps %xmm4, %xmm4
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm4
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3]
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT:    pextrq $1, %xmm2, %rax
-; SSE41-NEXT:    xorps %xmm4, %xmm4
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm4
-; SSE41-NEXT:    movq %xmm2, %rax
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
-; SSE41-NEXT:    movq %xmm3, %rax
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; SSE41-NEXT:    pextrq $1, %xmm3, %rax
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: sitofp_load_8i64_to_8f32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovdqa (%rdi), %xmm0
-; VEX-NEXT:    vmovdqa 16(%rdi), %xmm1
-; VEX-NEXT:    vmovdqa 32(%rdi), %xmm2
-; VEX-NEXT:    vmovdqa 48(%rdi), %xmm3
-; VEX-NEXT:    vpextrq $1, %xmm2, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
-; VEX-NEXT:    vmovq %xmm2, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm2
-; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
-; VEX-NEXT:    vmovq %xmm3, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
-; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; VEX-NEXT:    vpextrq $1, %xmm3, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; VEX-NEXT:    vmovq %xmm1, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; VEX-NEXT:    vpextrq $1, %xmm1, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; VEX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
-; AVX512F-NEXT:    vmovq %xmm2, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
-; AVX512F-NEXT:    vmovq %xmm3, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm4, %xmm4
-; AVX512VL-NEXT:    vmovq %xmm2, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
-; AVX512VL-NEXT:    vmovq %xmm3, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vcvtqq2ps (%rdi), %ymm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtqq2ps (%rdi), %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <8 x i64>, <8 x i64> *%a
-  %cvt = sitofp <8 x i64> %ld to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
-; SSE-LABEL: sitofp_load_8i32_to_8f32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2ps (%rdi), %xmm0
-; SSE-NEXT:    cvtdq2ps 16(%rdi), %xmm1
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_load_8i32_to_8f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2ps (%rdi), %ymm0
-; AVX-NEXT:    retq
-  %ld = load <8 x i32>, <8 x i32> *%a
-  %cvt = sitofp <8 x i32> %ld to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
-; SSE2-LABEL: sitofp_load_8i16_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_8i16_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
-; SSE41-NEXT:    pmovsxwd (%rdi), %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: sitofp_load_8i16_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm0
-; AVX1-NEXT:    vpmovsxwd (%rdi), %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_load_8i16_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxwd (%rdi), %ymm0
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_load_8i16_to_8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxwd (%rdi), %ymm0
-; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT:    retq
-  %ld = load <8 x i16>, <8 x i16> *%a
-  %cvt = sitofp <8 x i16> %ld to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
-; SSE2-LABEL: sitofp_load_8i8_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    psrad $24, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrad $24, %xmm1
-; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: sitofp_load_8i8_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbd 4(%rdi), %xmm1
-; SSE41-NEXT:    pmovsxbd (%rdi), %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: sitofp_load_8i8_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovsxbd 4(%rdi), %xmm0
-; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sitofp_load_8i8_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovsxbd (%rdi), %ymm0
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_load_8i8_to_8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovsxbd (%rdi), %ymm0
-; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT:    retq
-  %ld = load <8 x i8>, <8 x i8> *%a
-  %cvt = sitofp <8 x i8> %ld to <8 x float>
-  ret <8 x float> %cvt
-}
-
-;
-; Load Unsigned Integer to Float
-;
-
-define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
-; SSE2-LABEL: uitofp_load_4i64_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm2
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB81_1
-; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    jmp .LBB81_3
-; SSE2-NEXT:  .LBB81_1:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    addss %xmm1, %xmm1
-; SSE2-NEXT:  .LBB81_3:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB81_4
-; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
-; SSE2-NEXT:    jmp .LBB81_6
-; SSE2-NEXT:  .LBB81_4:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
-; SSE2-NEXT:    addss %xmm3, %xmm3
-; SSE2-NEXT:  .LBB81_6:
-; SSE2-NEXT:    movq %xmm2, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB81_7
-; SSE2-NEXT:  # %bb.8:
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    jmp .LBB81_9
-; SSE2-NEXT:  .LBB81_7:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    addss %xmm0, %xmm0
-; SSE2-NEXT:  .LBB81_9:
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT:    movq %xmm2, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB81_10
-; SSE2-NEXT:  # %bb.11:
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    jmp .LBB81_12
-; SSE2-NEXT:  .LBB81_10:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    addss %xmm2, %xmm2
-; SSE2-NEXT:  .LBB81_12:
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_4i64_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB81_1
-; SSE41-NEXT:  # %bb.2:
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    jmp .LBB81_3
-; SSE41-NEXT:  .LBB81_1:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    addss %xmm2, %xmm2
-; SSE41-NEXT:  .LBB81_3:
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB81_4
-; SSE41-NEXT:  # %bb.5:
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    jmp .LBB81_6
-; SSE41-NEXT:  .LBB81_4:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    addss %xmm0, %xmm0
-; SSE41-NEXT:  .LBB81_6:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; SSE41-NEXT:    movq %xmm1, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB81_7
-; SSE41-NEXT:  # %bb.8:
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    jmp .LBB81_9
-; SSE41-NEXT:  .LBB81_7:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    addss %xmm2, %xmm2
-; SSE41-NEXT:  .LBB81_9:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB81_10
-; SSE41-NEXT:  # %bb.11:
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT:    retq
-; SSE41-NEXT:  .LBB81_10:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    addss %xmm1, %xmm1
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_load_4i64_to_4f32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovdqa (%rdi), %xmm2
-; VEX-NEXT:    vmovdqa 16(%rdi), %xmm0
-; VEX-NEXT:    vpextrq $1, %xmm2, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB81_1
-; VEX-NEXT:  # %bb.2:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    jmp .LBB81_3
-; VEX-NEXT:  .LBB81_1:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT:  .LBB81_3:
-; VEX-NEXT:    vmovq %xmm2, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB81_4
-; VEX-NEXT:  # %bb.5:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT:    jmp .LBB81_6
-; VEX-NEXT:  .LBB81_4:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT:  .LBB81_6:
-; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB81_7
-; VEX-NEXT:  # %bb.8:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT:    jmp .LBB81_9
-; VEX-NEXT:  .LBB81_7:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT:  .LBB81_9:
-; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB81_10
-; VEX-NEXT:  # %bb.11:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VEX-NEXT:    retq
-; VEX-NEXT:  .LBB81_10:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm3, %xmm0
-; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm3, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtuqq2psy (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <4 x i64>, <4 x i64> *%a
-  %cvt = uitofp <4 x i64> %ld to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
-; SSE2-LABEL: uitofp_load_4i32_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    por {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    por {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    addps {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    addps %xmm1, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_4i32_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; SSE41-NEXT:    addps {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    addps %xmm1, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_load_4i32_to_4f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; AVX1-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_load_4i32_to_4f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; AVX2-NEXT:    vaddps %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2ps (%rdi), %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2ps (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <4 x i32>, <4 x i32> *%a
-  %cvt = uitofp <4 x i32> %ld to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
-; SSE2-LABEL: uitofp_load_4i16_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_4i16_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_load_4i16_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %ld = load <4 x i16>, <4 x i16> *%a
-  %cvt = uitofp <4 x i16> %ld to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
-; SSE2-LABEL: uitofp_load_4i8_to_4f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_4i8_to_4f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: uitofp_load_4i8_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %ld = load <4 x i8>, <4 x i8> *%a
-  %cvt = uitofp <4 x i8> %ld to <4 x float>
-  ret <4 x float> %cvt
-}
-
-define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
-; SSE2-LABEL: uitofp_load_8i64_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm5
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
-; SSE2-NEXT:    movdqa 32(%rdi), %xmm2
-; SSE2-NEXT:    movdqa 48(%rdi), %xmm1
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB85_1
-; SSE2-NEXT:  # %bb.2:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
-; SSE2-NEXT:    jmp .LBB85_3
-; SSE2-NEXT:  .LBB85_1:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm3
-; SSE2-NEXT:    addss %xmm3, %xmm3
-; SSE2-NEXT:  .LBB85_3:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movq %xmm0, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB85_4
-; SSE2-NEXT:  # %bb.5:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
-; SSE2-NEXT:    jmp .LBB85_6
-; SSE2-NEXT:  .LBB85_4:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm4
-; SSE2-NEXT:    addss %xmm4, %xmm4
-; SSE2-NEXT:  .LBB85_6:
-; SSE2-NEXT:    movq %xmm5, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB85_7
-; SSE2-NEXT:  # %bb.8:
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    jmp .LBB85_9
-; SSE2-NEXT:  .LBB85_7:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    addss %xmm0, %xmm0
-; SSE2-NEXT:  .LBB85_9:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
-; SSE2-NEXT:    movq %xmm5, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB85_10
-; SSE2-NEXT:  # %bb.11:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm6
-; SSE2-NEXT:    jmp .LBB85_12
-; SSE2-NEXT:  .LBB85_10:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm6
-; SSE2-NEXT:    addss %xmm6, %xmm6
-; SSE2-NEXT:  .LBB85_12:
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB85_13
-; SSE2-NEXT:  # %bb.14:
-; SSE2-NEXT:    xorps %xmm5, %xmm5
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm5
-; SSE2-NEXT:    jmp .LBB85_15
-; SSE2-NEXT:  .LBB85_13:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm5, %xmm5
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm5
-; SSE2-NEXT:    addss %xmm5, %xmm5
-; SSE2-NEXT:  .LBB85_15:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT:    movq %xmm1, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB85_16
-; SSE2-NEXT:  # %bb.17:
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm7
-; SSE2-NEXT:    jmp .LBB85_18
-; SSE2-NEXT:  .LBB85_16:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm7
-; SSE2-NEXT:    addss %xmm7, %xmm7
-; SSE2-NEXT:  .LBB85_18:
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE2-NEXT:    movq %xmm2, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB85_19
-; SSE2-NEXT:  # %bb.20:
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    jmp .LBB85_21
-; SSE2-NEXT:  .LBB85_19:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE2-NEXT:    addss %xmm1, %xmm1
-; SSE2-NEXT:  .LBB85_21:
-; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT:    movq %xmm2, %rax
-; SSE2-NEXT:    testq %rax, %rax
-; SSE2-NEXT:    js .LBB85_22
-; SSE2-NEXT:  # %bb.23:
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    jmp .LBB85_24
-; SSE2-NEXT:  .LBB85_22:
-; SSE2-NEXT:    movq %rax, %rcx
-; SSE2-NEXT:    shrq %rcx
-; SSE2-NEXT:    andl $1, %eax
-; SSE2-NEXT:    orq %rcx, %rax
-; SSE2-NEXT:    xorps %xmm2, %xmm2
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE2-NEXT:    addss %xmm2, %xmm2
-; SSE2-NEXT:  .LBB85_24:
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_8i64_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm4
-; SSE41-NEXT:    movdqa 32(%rdi), %xmm1
-; SSE41-NEXT:    movdqa 48(%rdi), %xmm2
-; SSE41-NEXT:    pextrq $1, %xmm0, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB85_1
-; SSE41-NEXT:  # %bb.2:
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm3
-; SSE41-NEXT:    jmp .LBB85_3
-; SSE41-NEXT:  .LBB85_1:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm3
-; SSE41-NEXT:    addss %xmm3, %xmm3
-; SSE41-NEXT:  .LBB85_3:
-; SSE41-NEXT:    movq %xmm0, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB85_4
-; SSE41-NEXT:  # %bb.5:
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    jmp .LBB85_6
-; SSE41-NEXT:  .LBB85_4:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    addss %xmm0, %xmm0
-; SSE41-NEXT:  .LBB85_6:
-; SSE41-NEXT:    movq %xmm4, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB85_7
-; SSE41-NEXT:  # %bb.8:
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm5
-; SSE41-NEXT:    jmp .LBB85_9
-; SSE41-NEXT:  .LBB85_7:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm5
-; SSE41-NEXT:    addss %xmm5, %xmm5
-; SSE41-NEXT:  .LBB85_9:
-; SSE41-NEXT:    pextrq $1, %xmm4, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB85_10
-; SSE41-NEXT:  # %bb.11:
-; SSE41-NEXT:    xorps %xmm4, %xmm4
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm4
-; SSE41-NEXT:    jmp .LBB85_12
-; SSE41-NEXT:  .LBB85_10:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm4, %xmm4
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm4
-; SSE41-NEXT:    addss %xmm4, %xmm4
-; SSE41-NEXT:  .LBB85_12:
-; SSE41-NEXT:    pextrq $1, %xmm1, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB85_13
-; SSE41-NEXT:  # %bb.14:
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm6
-; SSE41-NEXT:    jmp .LBB85_15
-; SSE41-NEXT:  .LBB85_13:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm6
-; SSE41-NEXT:    addss %xmm6, %xmm6
-; SSE41-NEXT:  .LBB85_15:
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; SSE41-NEXT:    movq %xmm1, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB85_16
-; SSE41-NEXT:  # %bb.17:
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    jmp .LBB85_18
-; SSE41-NEXT:  .LBB85_16:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm1
-; SSE41-NEXT:    addss %xmm1, %xmm1
-; SSE41-NEXT:  .LBB85_18:
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0],xmm0[3]
-; SSE41-NEXT:    movq %xmm2, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB85_19
-; SSE41-NEXT:  # %bb.20:
-; SSE41-NEXT:    xorps %xmm3, %xmm3
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm3
-; SSE41-NEXT:    jmp .LBB85_21
-; SSE41-NEXT:  .LBB85_19:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm3, %xmm3
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm3
-; SSE41-NEXT:    addss %xmm3, %xmm3
-; SSE41-NEXT:  .LBB85_21:
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; SSE41-NEXT:    pextrq $1, %xmm2, %rax
-; SSE41-NEXT:    testq %rax, %rax
-; SSE41-NEXT:    js .LBB85_22
-; SSE41-NEXT:  # %bb.23:
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; SSE41-NEXT:    retq
-; SSE41-NEXT:  .LBB85_22:
-; SSE41-NEXT:    movq %rax, %rcx
-; SSE41-NEXT:    shrq %rcx
-; SSE41-NEXT:    andl $1, %eax
-; SSE41-NEXT:    orq %rcx, %rax
-; SSE41-NEXT:    xorps %xmm2, %xmm2
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm2
-; SSE41-NEXT:    addss %xmm2, %xmm2
-; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: uitofp_load_8i64_to_8f32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovdqa (%rdi), %xmm1
-; VEX-NEXT:    vmovdqa 16(%rdi), %xmm0
-; VEX-NEXT:    vmovdqa 32(%rdi), %xmm4
-; VEX-NEXT:    vmovdqa 48(%rdi), %xmm3
-; VEX-NEXT:    vpextrq $1, %xmm4, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB85_1
-; VEX-NEXT:  # %bb.2:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; VEX-NEXT:    jmp .LBB85_3
-; VEX-NEXT:  .LBB85_1:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm2, %xmm2
-; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT:  .LBB85_3:
-; VEX-NEXT:    vmovq %xmm4, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB85_4
-; VEX-NEXT:  # %bb.5:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm5
-; VEX-NEXT:    jmp .LBB85_6
-; VEX-NEXT:  .LBB85_4:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm5, %xmm4
-; VEX-NEXT:    vaddss %xmm4, %xmm4, %xmm5
-; VEX-NEXT:  .LBB85_6:
-; VEX-NEXT:    vmovq %xmm3, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB85_7
-; VEX-NEXT:  # %bb.8:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm4
-; VEX-NEXT:    jmp .LBB85_9
-; VEX-NEXT:  .LBB85_7:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm4
-; VEX-NEXT:    vaddss %xmm4, %xmm4, %xmm4
-; VEX-NEXT:  .LBB85_9:
-; VEX-NEXT:    vpextrq $1, %xmm3, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB85_10
-; VEX-NEXT:  # %bb.11:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm3
-; VEX-NEXT:    jmp .LBB85_12
-; VEX-NEXT:  .LBB85_10:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm3
-; VEX-NEXT:    vaddss %xmm3, %xmm3, %xmm3
-; VEX-NEXT:  .LBB85_12:
-; VEX-NEXT:    vpextrq $1, %xmm1, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB85_13
-; VEX-NEXT:  # %bb.14:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm6
-; VEX-NEXT:    jmp .LBB85_15
-; VEX-NEXT:  .LBB85_13:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm6, %xmm6
-; VEX-NEXT:    vaddss %xmm6, %xmm6, %xmm6
-; VEX-NEXT:  .LBB85_15:
-; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3]
-; VEX-NEXT:    vmovq %xmm1, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB85_16
-; VEX-NEXT:  # %bb.17:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm1
-; VEX-NEXT:    jmp .LBB85_18
-; VEX-NEXT:  .LBB85_16:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm1
-; VEX-NEXT:    vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT:  .LBB85_18:
-; VEX-NEXT:    vinsertps {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[2,3]
-; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3]
-; VEX-NEXT:    vmovq %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB85_19
-; VEX-NEXT:  # %bb.20:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm2
-; VEX-NEXT:    jmp .LBB85_21
-; VEX-NEXT:  .LBB85_19:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm2
-; VEX-NEXT:    vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT:  .LBB85_21:
-; VEX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm5[0,1],xmm2[0],xmm5[3]
-; VEX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
-; VEX-NEXT:    vpextrq $1, %xmm0, %rax
-; VEX-NEXT:    testq %rax, %rax
-; VEX-NEXT:    js .LBB85_22
-; VEX-NEXT:  # %bb.23:
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm0
-; VEX-NEXT:    jmp .LBB85_24
-; VEX-NEXT:  .LBB85_22:
-; VEX-NEXT:    movq %rax, %rcx
-; VEX-NEXT:    shrq %rcx
-; VEX-NEXT:    andl $1, %eax
-; VEX-NEXT:    orq %rcx, %rax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm7, %xmm0
-; VEX-NEXT:    vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT:  .LBB85_24:
-; VEX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; VEX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm4
-; AVX512F-NEXT:    vmovq %xmm2, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm2
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
-; AVX512F-NEXT:    vmovq %xmm3, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT:    vmovq %xmm0, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm0
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; AVX512F-NEXT:    vmovq %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm1
-; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm4, %xmm4
-; AVX512VL-NEXT:    vmovq %xmm2, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm2
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
-; AVX512VL-NEXT:    vmovq %xmm3, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm4
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT:    vmovq %xmm0, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm0
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; AVX512VL-NEXT:    vmovq %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT:    vcvtusi2ss %rax, %xmm5, %xmm1
-; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vcvtuqq2ps (%rdi), %ymm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtuqq2ps (%rdi), %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <8 x i64>, <8 x i64> *%a
-  %cvt = uitofp <8 x i64> %ld to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
-; SSE2-LABEL: uitofp_load_8i32_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm0
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
-; SSE2-NEXT:    por %xmm4, %xmm3
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
-; SSE2-NEXT:    por %xmm5, %xmm0
-; SSE2-NEXT:    movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; SSE2-NEXT:    addps %xmm6, %xmm0
-; SSE2-NEXT:    addps %xmm3, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    por %xmm4, %xmm2
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    addps %xmm6, %xmm1
-; SSE2-NEXT:    addps %xmm2, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_8i32_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa (%rdi), %xmm0
-; SSE41-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; SSE41-NEXT:    movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; SSE41-NEXT:    addps %xmm5, %xmm0
-; SSE41-NEXT:    addps %xmm3, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; SSE41-NEXT:    addps %xmm5, %xmm1
-; SSE41-NEXT:    addps %xmm2, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_load_8i32_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vcvtdq2ps %ymm1, %ymm1
-; AVX1-NEXT:    vmulps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_load_8i32_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
-; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
-; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; AVX2-NEXT:    vaddps %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2ps (%rdi), %ymm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2ps (%rdi), %ymm0
-; AVX512VLDQ-NEXT:    retq
-  %ld = load <8 x i32>, <8 x i32> *%a
-  %cvt = uitofp <8 x i32> %ld to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
-; SSE2-LABEL: uitofp_load_8i16_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_8i16_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_load_8i16_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_load_8i16_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: uitofp_load_8i16_to_8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT:    retq
-  %ld = load <8 x i16>, <8 x i16> *%a
-  %cvt = uitofp <8 x i16> %ld to <8 x float>
-  ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
-; SSE2-LABEL: uitofp_load_8i8_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: uitofp_load_8i8_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: uitofp_load_8i8_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: uitofp_load_8i8_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: uitofp_load_8i8_to_8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT:    retq
-  %ld = load <8 x i8>, <8 x i8> *%a
-  %cvt = uitofp <8 x i8> %ld to <8 x float>
-  ret <8 x float> %cvt
-}
-
-;
-; Aggregates
-;
-
-%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
-define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
-; SSE2-LABEL: aggregate_sitofp_8i16_to_8f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movq 24(%rdi), %rax
-; SSE2-NEXT:    movdqu 8(%rdi), %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    psrad $16, %xmm1
-; SSE2-NEXT:    cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrad $16, %xmm0
-; SSE2-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT:    movaps %xmm0, 16(%rax)
-; SSE2-NEXT:    movaps %xmm1, (%rax)
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movq 24(%rdi), %rax
-; SSE41-NEXT:    pmovsxwd 16(%rdi), %xmm0
-; SSE41-NEXT:    pmovsxwd 8(%rdi), %xmm1
-; SSE41-NEXT:    cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT:    movaps %xmm0, 16(%rax)
-; SSE41-NEXT:    movaps %xmm1, (%rax)
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    movq 24(%rdi), %rax
-; AVX1-NEXT:    vpmovsxwd 16(%rdi), %xmm0
-; AVX1-NEXT:    vpmovsxwd 8(%rdi), %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    vmovaps %ymm0, (%rax)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    movq 24(%rdi), %rax
-; AVX2-NEXT:    vpmovsxwd 8(%rdi), %ymm0
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, (%rax)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: aggregate_sitofp_8i16_to_8f32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    movq 24(%rdi), %rax
-; AVX512-NEXT:    vpmovsxwd 8(%rdi), %ymm0
-; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT:    vmovaps %ymm0, (%rax)
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
- %1 = load %Arguments, %Arguments* %a0, align 1
- %2 = extractvalue %Arguments %1, 1
- %3 = extractvalue %Arguments %1, 2
- %4 = sitofp <8 x i16> %2 to <8 x float>
- store <8 x float> %4, <8 x float>* %3, align 32
- ret void
-}
-
-define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind {
-; SSE-LABEL: sitofp_i32_to_2f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtsi2sd %edi, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_i32_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtsi2sd %edi, %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %cvt = sitofp i32 %a1 to double
-  %res = insertelement <2 x double> %a0, double %cvt, i32 0
-  ret <2 x double> %res
-}
-
-define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind {
-; SSE-LABEL: sitofp_i32_to_4f32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtsi2ss %edi, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_i32_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtsi2ss %edi, %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %cvt = sitofp i32 %a1 to float
-  %res = insertelement <4 x float> %a0, float %cvt, i32 0
-  ret <4 x float> %res
-}
-
-define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind {
-; SSE-LABEL: sitofp_i64_to_2f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtsi2sd %rdi, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_i64_to_2f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtsi2sd %rdi, %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %cvt = sitofp i64 %a1 to double
-  %res = insertelement <2 x double> %a0, double %cvt, i32 0
-  ret <2 x double> %res
-}
-
-define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind {
-; SSE-LABEL: sitofp_i64_to_4f32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtsi2ss %rdi, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sitofp_i64_to_4f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtsi2ss %rdi, %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %cvt = sitofp i64 %a1 to float
-  %res = insertelement <4 x float> %a0, float %cvt, i32 0
-  ret <4 x float> %res
-}
-
-; Extract from int vector and convert to FP.
-
-define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract0_sitofp_v4i32_f32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extract0_sitofp_v4i32_f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %e = extractelement <4 x i32> %x, i32 0
-  %r = sitofp i32 %e to float
-  ret float %r
-}
-
-define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT:    incl %eax
-; SSE-NEXT:    cvtsi2ss %eax, %xmm1
-; SSE-NEXT:    divss %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    incl %eax
-; AVX-NEXT:    vcvtsi2ss %eax, %xmm1, %xmm1
-; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %e = extractelement <4 x i32> %x, i32 0
-  %f = sitofp i32 %e to float
-  %e1 = add i32 %e, 1
-  %f1 = sitofp i32 %e1 to float
-  %r = fdiv float %f, %f1
-  ret float %r
-}
-
-define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, i32* %p) nounwind {
-; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
-; SSE:       # %bb.0:
-; SSE-NEXT:    cvtdq2ps %xmm0, %xmm1
-; SSE-NEXT:    movss %xmm0, (%rdi)
-; SSE-NEXT:    movaps %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm1
-; AVX-NEXT:    vmovss %xmm0, (%rdi)
-; AVX-NEXT:    vmovaps %xmm1, %xmm0
-; AVX-NEXT:    retq
-  %e = extractelement <4 x i32> %x, i32 0
-  %r = sitofp i32 %e to float
-  store i32 %e, i32* %p
-  ret float %r
-}
-
-define double @extract0_sitofp_v4i32_f64(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract0_sitofp_v4i32_f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    cvtsi2sd %eax, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extract0_sitofp_v4i32_f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %e = extractelement <4 x i32> %x, i32 0
-  %r = sitofp i32 %e to double
-  ret double %r
-}
-
-define float @extract0_uitofp_v4i32_f32(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract0_uitofp_v4i32_f32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: extract0_uitofp_v4i32_f32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovd %xmm0, %eax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: extract0_uitofp_v4i32_f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: extract0_uitofp_v4i32_f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: extract0_uitofp_v4i32_f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %e = extractelement <4 x i32> %x, i32 0
-  %r = uitofp i32 %e to float
-  ret float %r
-}
-
-define double @extract0_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract0_uitofp_v4i32_f64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE-NEXT:    retq
-;
-; VEX-LABEL: extract0_uitofp_v4i32_f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vmovd %xmm0, %eax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: extract0_uitofp_v4i32_f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: extract0_uitofp_v4i32_f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: extract0_uitofp_v4i32_f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %e = extractelement <4 x i32> %x, i32 0
-  %r = uitofp i32 %e to double
-  ret double %r
-}
-
-; Extract non-zero element from int vector and convert to FP.
-
-define float @extract3_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract3_sitofp_v4i32_f32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: extract3_sitofp_v4i32_f32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %e = extractelement <4 x i32> %x, i32 3
-  %r = sitofp i32 %e to float
-  ret float %r
-}
-
-define double @extract3_sitofp_v4i32_f64(<4 x i32> %x) nounwind {
-; SSE2-LABEL: extract3_sitofp_v4i32_f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2sd %eax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract3_sitofp_v4i32_f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $3, %xmm0, %eax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2sd %eax, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: extract3_sitofp_v4i32_f64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT:    retq
-  %e = extractelement <4 x i32> %x, i32 3
-  %r = sitofp i32 %e to double
-  ret double %r
-}
-
-define float @extract3_uitofp_v4i32_f32(<4 x i32> %x) nounwind {
-; SSE2-LABEL: extract3_uitofp_v4i32_f32:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract3_uitofp_v4i32_f32:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $3, %xmm0, %eax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2ss %rax, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: extract3_uitofp_v4i32_f32:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vextractps $3, %xmm0, %eax
-; VEX-NEXT:    vcvtsi2ss %rax, %xmm1, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: extract3_uitofp_v4i32_f32:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: extract3_uitofp_v4i32_f32:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: extract3_uitofp_v4i32_f32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VLDQ-NEXT:    vcvtudq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %e = extractelement <4 x i32> %x, i32 3
-  %r = uitofp i32 %e to float
-  ret float %r
-}
-
-define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
-; SSE2-LABEL: extract3_uitofp_v4i32_f64:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    xorps %xmm0, %xmm0
-; SSE2-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract3_uitofp_v4i32_f64:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $3, %xmm0, %eax
-; SSE41-NEXT:    xorps %xmm0, %xmm0
-; SSE41-NEXT:    cvtsi2sd %rax, %xmm0
-; SSE41-NEXT:    retq
-;
-; VEX-LABEL: extract3_uitofp_v4i32_f64:
-; VEX:       # %bb.0:
-; VEX-NEXT:    vextractps $3, %xmm0, %eax
-; VEX-NEXT:    vcvtsi2sd %rax, %xmm1, %xmm0
-; VEX-NEXT:    retq
-;
-; AVX512F-LABEL: extract3_uitofp_v4i32_f64:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: extract3_uitofp_v4i32_f64:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64:
-; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT:    retq
-  %e = extractelement <4 x i32> %x, i32 3
-  %r = uitofp i32 %e to double
-  ret double %r
-}
-

Modified: llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll?rev=368079&r1=368078&r2=368079&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-idiv-v2i32.ll Tue Aug  6 13:12:20 2019
@@ -1,8 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X64_WIDEN
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X86_WIDEN
 
 define void @test_udiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
 ; X64-LABEL: test_udiv7_v2i32:
@@ -43,45 +41,6 @@ define void @test_udiv7_v2i32(<2 x i32>*
 ; X86-NEXT:    psrld $2, %xmm0
 ; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
-;
-; X64_WIDEN-LABEL: test_udiv7_v2i32:
-; X64_WIDEN:       # %bb.0:
-; X64_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X64_WIDEN-NEXT:    movdqa %xmm0, %xmm2
-; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64_WIDEN-NEXT:    psubd %xmm2, %xmm0
-; X64_WIDEN-NEXT:    psrld $1, %xmm0
-; X64_WIDEN-NEXT:    paddd %xmm2, %xmm0
-; X64_WIDEN-NEXT:    psrld $2, %xmm0
-; X64_WIDEN-NEXT:    movq %xmm0, (%rsi)
-; X64_WIDEN-NEXT:    retq
-;
-; X86_WIDEN-LABEL: test_udiv7_v2i32:
-; X86_WIDEN:       # %bb.0:
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm2
-; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm3
-; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86_WIDEN-NEXT:    psubd %xmm2, %xmm0
-; X86_WIDEN-NEXT:    psrld $1, %xmm0
-; X86_WIDEN-NEXT:    paddd %xmm2, %xmm0
-; X86_WIDEN-NEXT:    psrld $2, %xmm0
-; X86_WIDEN-NEXT:    movq %xmm0, (%eax)
-; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = udiv <2 x i32> %a, <i32 7, i32 7>
   store <2 x i32> %b, <2 x i32>* %y
@@ -137,55 +96,6 @@ define void @test_urem7_v2i32(<2 x i32>*
 ; X86-NEXT:    paddd %xmm0, %xmm1
 ; X86-NEXT:    movq %xmm1, (%eax)
 ; X86-NEXT:    retl
-;
-; X64_WIDEN-LABEL: test_urem7_v2i32:
-; X64_WIDEN:       # %bb.0:
-; X64_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X64_WIDEN-NEXT:    movdqa %xmm0, %xmm2
-; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64_WIDEN-NEXT:    movdqa %xmm0, %xmm1
-; X64_WIDEN-NEXT:    psubd %xmm2, %xmm1
-; X64_WIDEN-NEXT:    psrld $1, %xmm1
-; X64_WIDEN-NEXT:    paddd %xmm2, %xmm1
-; X64_WIDEN-NEXT:    psrld $2, %xmm1
-; X64_WIDEN-NEXT:    movdqa %xmm1, %xmm2
-; X64_WIDEN-NEXT:    pslld $3, %xmm2
-; X64_WIDEN-NEXT:    psubd %xmm2, %xmm1
-; X64_WIDEN-NEXT:    paddd %xmm0, %xmm1
-; X64_WIDEN-NEXT:    movq %xmm1, (%rsi)
-; X64_WIDEN-NEXT:    retq
-;
-; X86_WIDEN-LABEL: test_urem7_v2i32:
-; X86_WIDEN:       # %bb.0:
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm2
-; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm3
-; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm1
-; X86_WIDEN-NEXT:    psubd %xmm2, %xmm1
-; X86_WIDEN-NEXT:    psrld $1, %xmm1
-; X86_WIDEN-NEXT:    paddd %xmm2, %xmm1
-; X86_WIDEN-NEXT:    psrld $2, %xmm1
-; X86_WIDEN-NEXT:    movdqa %xmm1, %xmm2
-; X86_WIDEN-NEXT:    pslld $3, %xmm2
-; X86_WIDEN-NEXT:    psubd %xmm2, %xmm1
-; X86_WIDEN-NEXT:    paddd %xmm0, %xmm1
-; X86_WIDEN-NEXT:    movq %xmm1, (%eax)
-; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = urem <2 x i32> %a, <i32 7, i32 7>
   store <2 x i32> %b, <2 x i32>* %y
@@ -243,57 +153,6 @@ define void @test_sdiv7_v2i32(<2 x i32>*
 ; X86-NEXT:    paddd %xmm0, %xmm2
 ; X86-NEXT:    movq %xmm2, (%eax)
 ; X86-NEXT:    retl
-;
-; X64_WIDEN-LABEL: test_sdiv7_v2i32:
-; X64_WIDEN:       # %bb.0:
-; X64_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X64_WIDEN-NEXT:    movdqa %xmm0, %xmm2
-; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X64_WIDEN-NEXT:    pxor %xmm3, %xmm3
-; X64_WIDEN-NEXT:    pcmpgtd %xmm0, %xmm3
-; X64_WIDEN-NEXT:    pand %xmm1, %xmm3
-; X64_WIDEN-NEXT:    paddd %xmm0, %xmm3
-; X64_WIDEN-NEXT:    psubd %xmm3, %xmm2
-; X64_WIDEN-NEXT:    paddd %xmm0, %xmm2
-; X64_WIDEN-NEXT:    movdqa %xmm2, %xmm0
-; X64_WIDEN-NEXT:    psrld $31, %xmm0
-; X64_WIDEN-NEXT:    psrad $2, %xmm2
-; X64_WIDEN-NEXT:    paddd %xmm0, %xmm2
-; X64_WIDEN-NEXT:    movq %xmm2, (%rsi)
-; X64_WIDEN-NEXT:    retq
-;
-; X86_WIDEN-LABEL: test_sdiv7_v2i32:
-; X86_WIDEN:       # %bb.0:
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm2
-; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm3
-; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X86_WIDEN-NEXT:    pxor %xmm3, %xmm3
-; X86_WIDEN-NEXT:    pcmpgtd %xmm0, %xmm3
-; X86_WIDEN-NEXT:    pand %xmm1, %xmm3
-; X86_WIDEN-NEXT:    paddd %xmm0, %xmm3
-; X86_WIDEN-NEXT:    psubd %xmm3, %xmm2
-; X86_WIDEN-NEXT:    paddd %xmm0, %xmm2
-; X86_WIDEN-NEXT:    movdqa %xmm2, %xmm0
-; X86_WIDEN-NEXT:    psrld $31, %xmm0
-; X86_WIDEN-NEXT:    psrad $2, %xmm2
-; X86_WIDEN-NEXT:    paddd %xmm0, %xmm2
-; X86_WIDEN-NEXT:    movq %xmm2, (%eax)
-; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = sdiv <2 x i32> %a, <i32 7, i32 7>
   store <2 x i32> %b, <2 x i32>* %y
@@ -359,65 +218,6 @@ define void @test_srem7_v2i32(<2 x i32>*
 ; X86-NEXT:    paddd %xmm0, %xmm2
 ; X86-NEXT:    movq %xmm2, (%eax)
 ; X86-NEXT:    retl
-;
-; X64_WIDEN-LABEL: test_srem7_v2i32:
-; X64_WIDEN:       # %bb.0:
-; X64_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X64_WIDEN-NEXT:    movdqa %xmm0, %xmm2
-; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
-; X64_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X64_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X64_WIDEN-NEXT:    pxor %xmm3, %xmm3
-; X64_WIDEN-NEXT:    pcmpgtd %xmm0, %xmm3
-; X64_WIDEN-NEXT:    pand %xmm1, %xmm3
-; X64_WIDEN-NEXT:    paddd %xmm0, %xmm3
-; X64_WIDEN-NEXT:    psubd %xmm3, %xmm2
-; X64_WIDEN-NEXT:    paddd %xmm0, %xmm2
-; X64_WIDEN-NEXT:    movdqa %xmm2, %xmm1
-; X64_WIDEN-NEXT:    psrld $31, %xmm1
-; X64_WIDEN-NEXT:    psrad $2, %xmm2
-; X64_WIDEN-NEXT:    paddd %xmm1, %xmm2
-; X64_WIDEN-NEXT:    movdqa %xmm2, %xmm1
-; X64_WIDEN-NEXT:    pslld $3, %xmm1
-; X64_WIDEN-NEXT:    psubd %xmm1, %xmm2
-; X64_WIDEN-NEXT:    paddd %xmm0, %xmm2
-; X64_WIDEN-NEXT:    movq %xmm2, (%rsi)
-; X64_WIDEN-NEXT:    retq
-;
-; X86_WIDEN-LABEL: test_srem7_v2i32:
-; X86_WIDEN:       # %bb.0:
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86_WIDEN-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm2
-; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm2
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86_WIDEN-NEXT:    movdqa %xmm0, %xmm3
-; X86_WIDEN-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86_WIDEN-NEXT:    pmuludq %xmm1, %xmm3
-; X86_WIDEN-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86_WIDEN-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X86_WIDEN-NEXT:    pxor %xmm3, %xmm3
-; X86_WIDEN-NEXT:    pcmpgtd %xmm0, %xmm3
-; X86_WIDEN-NEXT:    pand %xmm1, %xmm3
-; X86_WIDEN-NEXT:    paddd %xmm0, %xmm3
-; X86_WIDEN-NEXT:    psubd %xmm3, %xmm2
-; X86_WIDEN-NEXT:    paddd %xmm0, %xmm2
-; X86_WIDEN-NEXT:    movdqa %xmm2, %xmm1
-; X86_WIDEN-NEXT:    psrld $31, %xmm1
-; X86_WIDEN-NEXT:    psrad $2, %xmm2
-; X86_WIDEN-NEXT:    paddd %xmm1, %xmm2
-; X86_WIDEN-NEXT:    movdqa %xmm2, %xmm1
-; X86_WIDEN-NEXT:    pslld $3, %xmm1
-; X86_WIDEN-NEXT:    psubd %xmm1, %xmm2
-; X86_WIDEN-NEXT:    paddd %xmm0, %xmm2
-; X86_WIDEN-NEXT:    movq %xmm2, (%eax)
-; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = srem <2 x i32> %a, <i32 7, i32 7>
   store <2 x i32> %b, <2 x i32>* %y
@@ -440,22 +240,6 @@ define void @test_udiv_pow2_v2i32(<2 x i
 ; X86-NEXT:    psrld $3, %xmm0
 ; X86-NEXT:    movq %xmm0, (%eax)
 ; X86-NEXT:    retl
-;
-; X64_WIDEN-LABEL: test_udiv_pow2_v2i32:
-; X64_WIDEN:       # %bb.0:
-; X64_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT:    psrld $3, %xmm0
-; X64_WIDEN-NEXT:    movq %xmm0, (%rsi)
-; X64_WIDEN-NEXT:    retq
-;
-; X86_WIDEN-LABEL: test_udiv_pow2_v2i32:
-; X86_WIDEN:       # %bb.0:
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X86_WIDEN-NEXT:    psrld $3, %xmm0
-; X86_WIDEN-NEXT:    movq %xmm0, (%eax)
-; X86_WIDEN-NEXT:    retl
   %a = load <2 x i32>, <2 x i32>* %x
   %b = udiv <2 x i32> %a, <i32 8, i32 8>
   store <2 x i32> %b, <2 x i32>* %y

Added: llvm/trunk/test/CodeGen/X86/vector-lzcnt-sub128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-sub128.ll?rev=368079&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-sub128.ll (added)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-sub128.ll Tue Aug  6 13:12:20 2019
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1 immarg)
+
+define <2 x i32> @illegal_ctlz(<2 x i32> %v1) {
+; CHECK-LABEL: illegal_ctlz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrld $1, %xmm1
+; CHECK-NEXT:    por %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    psrld $2, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrld $4, %xmm1
+; CHECK-NEXT:    por %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    psrld $8, %xmm0
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrld $16, %xmm1
+; CHECK-NEXT:    por %xmm0, %xmm1
+; CHECK-NEXT:    pcmpeqd %xmm2, %xmm2
+; CHECK-NEXT:    pxor %xmm1, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    psrlw $1, %xmm0
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    psubb %xmm0, %xmm2
+; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT:    movdqa %xmm2, %xmm1
+; CHECK-NEXT:    pand %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $2, %xmm2
+; CHECK-NEXT:    pand %xmm0, %xmm2
+; CHECK-NEXT:    paddb %xmm1, %xmm2
+; CHECK-NEXT:    movdqa %xmm2, %xmm0
+; CHECK-NEXT:    psrlw $4, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; CHECK-NEXT:    psadbw %xmm1, %xmm2
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    psadbw %xmm1, %xmm0
+; CHECK-NEXT:    packuswb %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %v2 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %v1, i1 true)
+  ret <2 x i32> %v2
+}
+
+declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1 immarg)
+
+define <2 x i32> @illegal_cttz(<2 x i32> %v1) {
+; CHECK-LABEL: illegal_cttz:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    paddd %xmm0, %xmm1
+; CHECK-NEXT:    pandn %xmm1, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $1, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    psubb %xmm1, %xmm0
+; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; CHECK-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NEXT:    pand %xmm1, %xmm2
+; CHECK-NEXT:    psrlw $2, %xmm0
+; CHECK-NEXT:    pand %xmm1, %xmm0
+; CHECK-NEXT:    paddb %xmm2, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NEXT:    psrlw $4, %xmm1
+; CHECK-NEXT:    paddb %xmm0, %xmm1
+; CHECK-NEXT:    pand {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; CHECK-NEXT:    psadbw %xmm0, %xmm2
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    psadbw %xmm0, %xmm1
+; CHECK-NEXT:    packuswb %xmm2, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %v2 = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %v1, i1 true)
+  ret <2 x i32> %v2
+}

Removed: llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll?rev=368078&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-add-widen.ll (removed)
@@ -1,1386 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
-
-;
-; vXi64
-;
-
-define i64 @test_v2i64(<2 x i64> %a0) {
-; SSE-LABEL: test_v2i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    paddq %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: test_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_v2i64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a0)
-  ret i64 %1
-}
-
-define i64 @test_v4i64(<4 x i64> %a0) {
-; SSE-LABEL: test_v4i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    paddq %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    paddq %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v4i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v4i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v4i64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %a0)
-  ret i64 %1
-}
-
-define i64 @test_v8i64(<8 x i64> %a0) {
-; SSE-LABEL: test_v8i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    paddq %xmm3, %xmm1
-; SSE-NEXT:    paddq %xmm2, %xmm1
-; SSE-NEXT:    paddq %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    paddq %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v8i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v8i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v8i64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %a0)
-  ret i64 %1
-}
-
-define i64 @test_v16i64(<16 x i64> %a0) {
-; SSE-LABEL: test_v16i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    paddq %xmm6, %xmm2
-; SSE-NEXT:    paddq %xmm7, %xmm3
-; SSE-NEXT:    paddq %xmm5, %xmm3
-; SSE-NEXT:    paddq %xmm1, %xmm3
-; SSE-NEXT:    paddq %xmm4, %xmm2
-; SSE-NEXT:    paddq %xmm3, %xmm2
-; SSE-NEXT:    paddq %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    paddq %xmm2, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v16i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v16i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddq %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v16i64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %a0)
-  ret i64 %1
-}
-
-;
-; vXi32
-;
-
-define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE-LABEL: test_v2i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    retq
-;
-; AVX1-SLOW-LABEL: test_v2i32:
-; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT:    retq
-;
-; AVX1-FAST-LABEL: test_v2i32:
-; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX1-FAST-NEXT:    retq
-;
-; AVX2-LABEL: test_v2i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v2i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0)
-  ret i32 %1
-}
-
-define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE-LABEL: test_v4i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    paddd %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    retq
-;
-; AVX1-SLOW-LABEL: test_v4i32:
-; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT:    retq
-;
-; AVX1-FAST-LABEL: test_v4i32:
-; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX1-FAST-NEXT:    retq
-;
-; AVX2-LABEL: test_v4i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v4i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a0)
-  ret i32 %1
-}
-
-define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE-LABEL: test_v8i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    paddd %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    paddd %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    retq
-;
-; AVX1-SLOW-LABEL: test_v8i32:
-; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT:    vzeroupper
-; AVX1-SLOW-NEXT:    retq
-;
-; AVX1-FAST-LABEL: test_v8i32:
-; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm1, %xmm0
-; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX1-FAST-NEXT:    vzeroupper
-; AVX1-FAST-NEXT:    retq
-;
-; AVX2-LABEL: test_v8i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v8i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a0)
-  ret i32 %1
-}
-
-define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE-LABEL: test_v16i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    paddd %xmm3, %xmm1
-; SSE-NEXT:    paddd %xmm2, %xmm1
-; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    paddd %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    retq
-;
-; AVX1-SLOW-LABEL: test_v16i32:
-; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT:    vzeroupper
-; AVX1-SLOW-NEXT:    retq
-;
-; AVX1-FAST-LABEL: test_v16i32:
-; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX1-FAST-NEXT:    vzeroupper
-; AVX1-FAST-NEXT:    retq
-;
-; AVX2-LABEL: test_v16i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v16i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a0)
-  ret i32 %1
-}
-
-define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE-LABEL: test_v32i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    paddd %xmm6, %xmm2
-; SSE-NEXT:    paddd %xmm7, %xmm3
-; SSE-NEXT:    paddd %xmm5, %xmm3
-; SSE-NEXT:    paddd %xmm1, %xmm3
-; SSE-NEXT:    paddd %xmm4, %xmm2
-; SSE-NEXT:    paddd %xmm3, %xmm2
-; SSE-NEXT:    paddd %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    paddd %xmm2, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    paddd %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    retq
-;
-; AVX1-SLOW-LABEL: test_v32i32:
-; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-SLOW-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT:    vzeroupper
-; AVX1-SLOW-NEXT:    retq
-;
-; AVX1-FAST-LABEL: test_v32i32:
-; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm4
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-FAST-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX1-FAST-NEXT:    vzeroupper
-; AVX1-FAST-NEXT:    retq
-;
-; AVX2-LABEL: test_v32i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v32i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %a0)
-  ret i32 %1
-}
-
-;
-; vXi16
-;
-
-define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE-LABEL: test_v2i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    paddw %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX1-SLOW-LABEL: test_v2i16:
-; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT:    retq
-;
-; AVX1-FAST-LABEL: test_v2i16:
-; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT:    retq
-;
-; AVX2-LABEL: test_v2i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v2i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> %a0)
-  ret i16 %1
-}
-
-define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    paddw %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    paddw %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX1-SLOW-LABEL: test_v4i16:
-; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT:    retq
-;
-; AVX1-FAST-LABEL: test_v4i16:
-; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT:    retq
-;
-; AVX2-LABEL: test_v4i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v4i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a0)
-  ret i16 %1
-}
-
-define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE-LABEL: test_v8i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    paddw %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    paddw %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    paddw %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX1-SLOW-LABEL: test_v8i16:
-; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT:    retq
-;
-; AVX1-FAST-LABEL: test_v8i16:
-; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT:    retq
-;
-; AVX2-LABEL: test_v8i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v8i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a0)
-  ret i16 %1
-}
-
-define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE-LABEL: test_v16i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    paddw %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    paddw %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    paddw %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    paddw %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX1-SLOW-LABEL: test_v16i16:
-; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT:    vzeroupper
-; AVX1-SLOW-NEXT:    retq
-;
-; AVX1-FAST-LABEL: test_v16i16:
-; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm1, %xmm0
-; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT:    vzeroupper
-; AVX1-FAST-NEXT:    retq
-;
-; AVX2-LABEL: test_v16i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v16i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %a0)
-  ret i16 %1
-}
-
-define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE-LABEL: test_v32i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    paddw %xmm3, %xmm1
-; SSE-NEXT:    paddw %xmm2, %xmm1
-; SSE-NEXT:    paddw %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    paddw %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    paddw %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    paddw %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX1-SLOW-LABEL: test_v32i16:
-; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT:    vzeroupper
-; AVX1-SLOW-NEXT:    retq
-;
-; AVX1-FAST-LABEL: test_v32i16:
-; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT:    vzeroupper
-; AVX1-FAST-NEXT:    retq
-;
-; AVX2-LABEL: test_v32i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v32i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %a0)
-  ret i16 %1
-}
-
-define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE-LABEL: test_v64i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    paddw %xmm6, %xmm2
-; SSE-NEXT:    paddw %xmm7, %xmm3
-; SSE-NEXT:    paddw %xmm5, %xmm3
-; SSE-NEXT:    paddw %xmm1, %xmm3
-; SSE-NEXT:    paddw %xmm4, %xmm2
-; SSE-NEXT:    paddw %xmm3, %xmm2
-; SSE-NEXT:    paddw %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    paddw %xmm2, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    paddw %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    paddw %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX1-SLOW-LABEL: test_v64i16:
-; AVX1-SLOW:       # %bb.0:
-; AVX1-SLOW-NEXT:    vpaddw %xmm3, %xmm1, %xmm4
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT:    vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT:    vzeroupper
-; AVX1-SLOW-NEXT:    retq
-;
-; AVX1-FAST-LABEL: test_v64i16:
-; AVX1-FAST:       # %bb.0:
-; AVX1-FAST-NEXT:    vpaddw %xmm3, %xmm1, %xmm4
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm3, %xmm3
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-FAST-NEXT:    vpaddw %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT:    vpaddw %xmm4, %xmm2, %xmm2
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm2, %xmm1
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT:    vmovd %xmm0, %eax
-; AVX1-FAST-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT:    vzeroupper
-; AVX1-FAST-NEXT:    retq
-;
-; AVX2-LABEL: test_v64i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v64i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %a0)
-  ret i16 %1
-}
-
-;
-; vXi8
-;
-
-define i8 @test_v2i8(<2 x i8> %a0) {
-; SSE2-LABEL: test_v2i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v2i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    paddb %xmm0, %xmm1
-; SSE41-NEXT:    pextrb $0, %xmm1, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_v2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_v2i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v4i8(<4 x i8> %a0) {
-; SSE2-LABEL: test_v4i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $8, %xmm0
-; SSE2-NEXT:    paddb %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v4i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    paddb %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrlw $8, %xmm0
-; SSE41-NEXT:    paddb %xmm1, %xmm0
-; SSE41-NEXT:    pextrb $0, %xmm0, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_v4i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_v4i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v8i8(<8 x i8> %a0) {
-; SSE2-LABEL: test_v8i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    paddb %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT:    paddb %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    paddb %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    paddb %xmm0, %xmm1
-; SSE41-NEXT:    pextrb $0, %xmm1, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_v8i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_v8i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    paddb %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm1, %xmm0
-; SSE41-NEXT:    pextrb $0, %xmm0, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v32i8(<32 x i8> %a0) {
-; SSE2-LABEL: test_v32i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddb %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    psadbw %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v32i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddb %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    paddb %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm0, %xmm0
-; SSE41-NEXT:    psadbw %xmm1, %xmm0
-; SSE41-NEXT:    pextrb $0, %xmm0, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: test_v32i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX1-NEXT:    # kill: def $al killed $al killed $eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v32i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX2-NEXT:    # kill: def $al killed $al killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v32i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v64i8(<64 x i8> %a0) {
-; SSE2-LABEL: test_v64i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddb %xmm3, %xmm1
-; SSE2-NEXT:    paddb %xmm2, %xmm1
-; SSE2-NEXT:    paddb %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT:    paddb %xmm1, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v64i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddb %xmm3, %xmm1
-; SSE41-NEXT:    paddb %xmm2, %xmm1
-; SSE41-NEXT:    paddb %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT:    paddb %xmm1, %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    psadbw %xmm0, %xmm1
-; SSE41-NEXT:    pextrb $0, %xmm1, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: test_v64i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpaddb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX1-NEXT:    # kill: def $al killed $al killed $eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v64i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX2-NEXT:    # kill: def $al killed $al killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v64i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v128i8(<128 x i8> %a0) {
-; SSE2-LABEL: test_v128i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddb %xmm7, %xmm3
-; SSE2-NEXT:    paddb %xmm5, %xmm3
-; SSE2-NEXT:    paddb %xmm1, %xmm3
-; SSE2-NEXT:    paddb %xmm6, %xmm2
-; SSE2-NEXT:    paddb %xmm4, %xmm2
-; SSE2-NEXT:    paddb %xmm3, %xmm2
-; SSE2-NEXT:    paddb %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT:    paddb %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    psadbw %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v128i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddb %xmm7, %xmm3
-; SSE41-NEXT:    paddb %xmm5, %xmm3
-; SSE41-NEXT:    paddb %xmm1, %xmm3
-; SSE41-NEXT:    paddb %xmm6, %xmm2
-; SSE41-NEXT:    paddb %xmm4, %xmm2
-; SSE41-NEXT:    paddb %xmm3, %xmm2
-; SSE41-NEXT:    paddb %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT:    paddb %xmm2, %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    psadbw %xmm0, %xmm1
-; SSE41-NEXT:    pextrb $0, %xmm1, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: test_v128i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpaddb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpaddb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX1-NEXT:    # kill: def $al killed $al killed $eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v128i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX2-NEXT:    # kill: def $al killed $al killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v128i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %a0)
-  ret i8 %1
-}
-
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>)

Removed: llvm/trunk/test/CodeGen/X86/vector-reduce-and-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-reduce-and-widen.ll?rev=368078&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-reduce-and-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-reduce-and-widen.ll (removed)
@@ -1,1168 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
-
-;
-; vXi64
-;
-
-define i64 @test_v2i64(<2 x i64> %a0) {
-; SSE-LABEL: test_v2i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: test_v2i64:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovq %xmm0, %rax
-; AVX-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0)
-  ret i64 %1
-}
-
-define i64 @test_v4i64(<4 x i64> %a0) {
-; SSE-LABEL: test_v4i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movq %xmm1, %rax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v4i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v4i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v4i64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0)
-  ret i64 %1
-}
-
-define i64 @test_v8i64(<8 x i64> %a0) {
-; SSE-LABEL: test_v8i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v8i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v8i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v8i64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0)
-  ret i64 %1
-}
-
-define i64 @test_v16i64(<16 x i64> %a0) {
-; SSE-LABEL: test_v16i64:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm5, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    movq %xmm0, %rax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v16i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v16i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v16i64:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovq %xmm0, %rax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0)
-  ret i64 %1
-}
-
-;
-; vXi32
-;
-
-define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE-LABEL: test_v2i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: test_v2i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0)
-  ret i32 %1
-}
-
-define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE-LABEL: test_v4i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: test_v4i32:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0)
-  ret i32 %1
-}
-
-define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE-LABEL: test_v8i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v8i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v8i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v8i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0)
-  ret i32 %1
-}
-
-define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE-LABEL: test_v16i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v16i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v16i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v16i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0)
-  ret i32 %1
-}
-
-define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE-LABEL: test_v32i32:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm5, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v32i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v32i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v32i32:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0)
-  ret i32 %1
-}
-
-;
-; vXi16
-;
-
-define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE-LABEL: test_v2i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: test_v2i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0)
-  ret i16 %1
-}
-
-define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: test_v4i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0)
-  ret i16 %1
-}
-
-define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE-LABEL: test_v8i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: test_v8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0)
-  ret i16 %1
-}
-
-define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE-LABEL: test_v16i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v16i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v16i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v16i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0)
-  ret i16 %1
-}
-
-define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE-LABEL: test_v32i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand %xmm3, %xmm1
-; SSE-NEXT:    pand %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v32i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v32i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v32i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0)
-  ret i16 %1
-}
-
-define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE-LABEL: test_v64i16:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pand %xmm6, %xmm2
-; SSE-NEXT:    pand %xmm7, %xmm3
-; SSE-NEXT:    pand %xmm5, %xmm3
-; SSE-NEXT:    pand %xmm1, %xmm3
-; SSE-NEXT:    pand %xmm4, %xmm2
-; SSE-NEXT:    pand %xmm3, %xmm2
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SSE-NEXT:    retq
-;
-; AVX1-LABEL: test_v64i16:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v64i16:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v64i16:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0)
-  ret i16 %1
-}
-
-;
-; vXi8
-;
-
-define i8 @test_v2i8(<2 x i8> %a0) {
-; SSE2-LABEL: test_v2i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v2i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pextrb $0, %xmm1, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_v2i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v4i8(<4 x i8> %a0) {
-; SSE2-LABEL: test_v4i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $8, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v4i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrlw $8, %xmm0
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    pextrb $0, %xmm0, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_v4i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v8i8(<8 x i8> %a0) {
-; SSE2-LABEL: test_v8i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v8i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pextrb $0, %xmm1, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_v8i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $8, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v16i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrlw $8, %xmm0
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    pextrb $0, %xmm0, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX-NEXT:    # kill: def $al killed $al killed $eax
-; AVX-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v32i8(<32 x i8> %a0) {
-; SSE2-LABEL: test_v32i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrld $16, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrlw $8, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v32i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrld $16, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrlw $8, %xmm0
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    pextrb $0, %xmm0, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: test_v32i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX1-NEXT:    # kill: def $al killed $al killed $eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v32i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX2-NEXT:    # kill: def $al killed $al killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v32i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v64i8(<64 x i8> %a0) {
-; SSE2-LABEL: test_v64i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pand %xmm3, %xmm1
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v64i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pand %xmm3, %xmm1
-; SSE41-NEXT:    pand %xmm2, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pextrb $0, %xmm1, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: test_v64i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX1-NEXT:    # kill: def $al killed $al killed $eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v64i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX2-NEXT:    # kill: def $al killed $al killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v64i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0)
-  ret i8 %1
-}
-
-define i8 @test_v128i8(<128 x i8> %a0) {
-; SSE2-LABEL: test_v128i8:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pand %xmm6, %xmm2
-; SSE2-NEXT:    pand %xmm7, %xmm3
-; SSE2-NEXT:    pand %xmm5, %xmm3
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    pand %xmm3, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrld $16, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    # kill: def $al killed $al killed $eax
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: test_v128i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pand %xmm6, %xmm2
-; SSE41-NEXT:    pand %xmm7, %xmm3
-; SSE41-NEXT:    pand %xmm5, %xmm3
-; SSE41-NEXT:    pand %xmm1, %xmm3
-; SSE41-NEXT:    pand %xmm4, %xmm2
-; SSE41-NEXT:    pand %xmm3, %xmm2
-; SSE41-NEXT:    pand %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT:    pand %xmm2, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    psrld $16, %xmm0
-; SSE41-NEXT:    pand %xmm1, %xmm0
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    psrlw $8, %xmm1
-; SSE41-NEXT:    pand %xmm0, %xmm1
-; SSE41-NEXT:    pextrb $0, %xmm1, %eax
-; SSE41-NEXT:    # kill: def $al killed $al killed $eax
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: test_v128i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX1-NEXT:    # kill: def $al killed $al killed $eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v128i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX2-NEXT:    # kill: def $al killed $al killed $eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: test_v128i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
-  %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0)
-  ret i8 %1
-}
-
-declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>)




More information about the llvm-commits mailing list