[llvm] r311091 - [AVX512] Don't switch unmasked subvector insert/extract instructions when AVX512DQI is enabled.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 17 08:40:25 PDT 2017


Author: ctopper
Date: Thu Aug 17 08:40:25 2017
New Revision: 311091

URL: http://llvm.org/viewvc/llvm-project?rev=311091&view=rev
Log:
[AVX512] Don't switch unmasked subvector insert/extract instructions when AVX512DQI is enabled.

There's no reason to select different instructions for unmasked subvector operations based on whether DQI is available. Doing so just creates extra isel patterns and test divergences.

There is, however, value in enabling the masked versions of these instructions when DQI is available.

This required introducing some new multiclasses to enable this splitting.

Differential Revision: https://reviews.llvm.org/D36661
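
For illustration only (not part of the commit), a minimal IR sketch of the split the log describes; the function names are hypothetical. Compiled with something like llc -mtriple=x86_64-unknown-unknown -mattr=+avx512dq, the unmasked extract below is expected to select the plain vextractf32x4 encoding even though DQI is enabled, while the masked form can still use the DQI-only vextractf64x2 with a {%k1} write mask:

define <2 x double> @unmasked_extract(<8 x double> %x) {
  ; Plain subvector extract of elements 6,7: no mask, so the non-DQI
  ; instruction suffices and no extra isel pattern is needed.
  %r = shufflevector <8 x double> %x, <8 x double> undef, <2 x i32> <i32 6, i32 7>
  ret <2 x double> %r
}

define <2 x double> @masked_extract(<8 x double> %x, <2 x double> %src, <2 x i1> %m) {
  ; Extract merged under a mask: here the DQI masked instruction pays off,
  ; since the select can fold into the extract's write mask.
  %e = shufflevector <8 x double> %x, <8 x double> undef, <2 x i32> <i32 6, i32 7>
  %r = select <2 x i1> %m, <2 x double> %e, <2 x double> %src
  ret <2 x double> %r
}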

Modified:
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
    llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/trunk/test/CodeGen/X86/avx512-trunc.ll
    llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/compress_expand.ll
    llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
    llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512.ll
    llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll
    llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
    llvm/trunk/test/CodeGen/X86/vector-compare-results.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
    llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Thu Aug 17 08:40:25 2017
@@ -285,6 +285,28 @@ multiclass AVX512_maskable_fp_common<bit
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the vector instruction.  In the masking case, the
 // preserved vector elements come from a new dummy input operand tied to $dst.
+// This version uses a separate dag for non-masking and masking.
+multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
+                           dag Outs, dag Ins, string OpcodeStr,
+                           string AttSrcAsm, string IntelSrcAsm,
+                           dag RHS, dag MaskRHS,
+                           InstrItinClass itin = NoItinerary,
+                           bit IsCommutable = 0, bit IsKCommutable = 0,
+                           SDNode Select = vselect> :
+   AVX512_maskable_custom<O, F, Outs, Ins,
+                          !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+                          !con((ins _.KRCWM:$mask), Ins),
+                          OpcodeStr, AttSrcAsm, IntelSrcAsm,
+                          [(set _.RC:$dst, RHS)],
+                          [(set _.RC:$dst,
+                              (Select _.KRCWM:$mask, MaskRHS, _.RC:$src0))],
+                          [(set _.RC:$dst,
+                              (Select _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
+                          "$src0 = $dst", itin, IsCommutable, IsKCommutable>;
+
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the vector instruction.  In the masking case, the
+// preserved vector elements come from a new dummy input operand tied to $dst.
 multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
                            dag Outs, dag Ins, string OpcodeStr,
                            string AttSrcAsm, string IntelSrcAsm,
@@ -512,28 +534,45 @@ let isReMaterializable = 1, isAsCheapAsA
 //===----------------------------------------------------------------------===//
 // AVX-512 - VECTOR INSERT
 //
-multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To,
-                                                       PatFrag vinsert_insert> {
+
+// Supports two different pattern operators for masked and unmasked ops. Allows
+// null_frag to be passed for one.
+multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
+                                  X86VectorVTInfo To,
+                                  SDPatternOperator vinsert_insert,
+                                  SDPatternOperator vinsert_for_mask> {
   let ExeDomain = To.ExeDomain in {
-    defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
+    defm rr : AVX512_maskable_split<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
                    (ins To.RC:$src1, From.RC:$src2, u8imm:$src3),
                    "vinsert" # From.EltTypeName # "x" # From.NumElts,
                    "$src3, $src2, $src1", "$src1, $src2, $src3",
                    (vinsert_insert:$src3 (To.VT To.RC:$src1),
                                          (From.VT From.RC:$src2),
-                                         (iPTR imm))>, AVX512AIi8Base, EVEX_4V;
+                                         (iPTR imm)),
+                   (vinsert_for_mask:$src3 (To.VT To.RC:$src1),
+                                           (From.VT From.RC:$src2),
+                                           (iPTR imm))>, AVX512AIi8Base, EVEX_4V;
 
-    defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
+    defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
                    (ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
                    "vinsert" # From.EltTypeName # "x" # From.NumElts,
                    "$src3, $src2, $src1", "$src1, $src2, $src3",
                    (vinsert_insert:$src3 (To.VT To.RC:$src1),
                                (From.VT (bitconvert (From.LdFrag addr:$src2))),
+                               (iPTR imm)),
+                   (vinsert_for_mask:$src3 (To.VT To.RC:$src1),
+                               (From.VT (bitconvert (From.LdFrag addr:$src2))),
                                (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
                    EVEX_CD8<From.EltSize, From.CD8TupleForm>;
   }
 }
 
+// Passes the same pattern operator for masked and unmasked ops.
+multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From,
+                            X86VectorVTInfo To,
+                            SDPatternOperator vinsert_insert> :
+  vinsert_for_size_split<Opcode, From, To, vinsert_insert, vinsert_insert>;
+
 multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
                        X86VectorVTInfo To, PatFrag vinsert_insert,
                        SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> {
@@ -573,22 +612,24 @@ multiclass vinsert_for_type<ValueType El
                                  X86VectorVTInfo< 8, EltVT64, VR512>,
                                  vinsert256_insert>, VEX_W, EVEX_V512;
 
+  // Even with DQI we'd like to only use these instructions for masking.
   let Predicates = [HasVLX, HasDQI] in
-    defm NAME # "64x2Z256" : vinsert_for_size<Opcode128,
+    defm NAME # "64x2Z256" : vinsert_for_size_split<Opcode128,
                                    X86VectorVTInfo< 2, EltVT64, VR128X>,
                                    X86VectorVTInfo< 4, EltVT64, VR256X>,
-                                   vinsert128_insert>, VEX_W, EVEX_V256;
+                                   null_frag, vinsert128_insert>, VEX_W, EVEX_V256;
 
+  // Even with DQI we'd like to only use these instructions for masking.
   let Predicates = [HasDQI] in {
-    defm NAME # "64x2Z" : vinsert_for_size<Opcode128,
+    defm NAME # "64x2Z" : vinsert_for_size_split<Opcode128,
                                  X86VectorVTInfo< 2, EltVT64, VR128X>,
                                  X86VectorVTInfo< 8, EltVT64, VR512>,
-                                 vinsert128_insert>, VEX_W, EVEX_V512;
+                                 null_frag, vinsert128_insert>, VEX_W, EVEX_V512;
 
-    defm NAME # "32x8Z" : vinsert_for_size<Opcode256,
+    defm NAME # "32x8Z" : vinsert_for_size_split<Opcode256,
                                    X86VectorVTInfo< 8, EltVT32, VR256X>,
                                    X86VectorVTInfo<16, EltVT32, VR512>,
-                                   vinsert256_insert>, EVEX_V512;
+                                   null_frag, vinsert256_insert>, EVEX_V512;
   }
 }
 
@@ -596,21 +637,21 @@ defm VINSERTF : vinsert_for_type<f32, 0x
 defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>;
 
 // Codegen pattern with the alternative types,
-// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
+// Even with AVX512DQ we'll still use these for unmasked operations.
 defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
-              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
 defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
-              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
 
 defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
-              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
 defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
-              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+              vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
 
 defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
-              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
 defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
-              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+              vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
 
 // Codegen pattern with the alternative types insert VEC128 into VEC256
 defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
@@ -647,16 +688,20 @@ def VINSERTPSZrm: AVX512AIi8<0x21, MRMSr
 // AVX-512 VECTOR EXTRACT
 //---
 
-multiclass vextract_for_size<int Opcode,
-                             X86VectorVTInfo From, X86VectorVTInfo To,
-                             PatFrag vextract_extract> {
+// Supports two different pattern operators for masked and unmasked ops. Allows
+// null_frag to be passed for one.
+multiclass vextract_for_size_split<int Opcode,
+                                   X86VectorVTInfo From, X86VectorVTInfo To,
+                                   SDPatternOperator vextract_extract,
+                                   SDPatternOperator vextract_for_mask> {
 
   let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
-    defm rr : AVX512_maskable<Opcode, MRMDestReg, To, (outs To.RC:$dst),
+    defm rr : AVX512_maskable_split<Opcode, MRMDestReg, To, (outs To.RC:$dst),
                 (ins From.RC:$src1, u8imm:$idx),
                 "vextract" # To.EltTypeName # "x" # To.NumElts,
                 "$idx, $src1", "$src1, $idx",
-                (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm))>,
+                (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)),
+                (vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm))>,
               AVX512AIi8Base, EVEX;
     def mr  : AVX512AIi8<Opcode, MRMDestMem, (outs),
                     (ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx),
@@ -677,6 +722,12 @@ multiclass vextract_for_size<int Opcode,
   }
 }
 
+// Passes the same pattern operator for masked and unmasked ops.
+multiclass vextract_for_size<int Opcode, X86VectorVTInfo From,
+                             X86VectorVTInfo To,
+                             SDPatternOperator vextract_extract> :
+  vextract_for_size_split<Opcode, From, To, vextract_extract, vextract_extract>;
+
 // Codegen pattern for the alternative types
 multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
                 X86VectorVTInfo To, PatFrag vextract_extract,
@@ -713,22 +764,26 @@ multiclass vextract_for_type<ValueType E
                                  X86VectorVTInfo< 4, EltVT32, VR128X>,
                                  vextract128_extract>,
                                      EVEX_V256, EVEX_CD8<32, CD8VT4>;
+
+  // Even with DQI we'd like to only use these instructions for masking.
   let Predicates = [HasVLX, HasDQI] in
-    defm NAME # "64x2Z256" : vextract_for_size<Opcode128,
+    defm NAME # "64x2Z256" : vextract_for_size_split<Opcode128,
                                  X86VectorVTInfo< 4, EltVT64, VR256X>,
                                  X86VectorVTInfo< 2, EltVT64, VR128X>,
-                                 vextract128_extract>,
+                                 null_frag, vextract128_extract>,
                                      VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
+
+  // Even with DQI we'd like to only use these instructions for masking.
   let Predicates = [HasDQI] in {
-    defm NAME # "64x2Z" : vextract_for_size<Opcode128,
+    defm NAME # "64x2Z" : vextract_for_size_split<Opcode128,
                                  X86VectorVTInfo< 8, EltVT64, VR512>,
                                  X86VectorVTInfo< 2, EltVT64, VR128X>,
-                                 vextract128_extract>,
+                                 null_frag, vextract128_extract>,
                                      VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
-    defm NAME # "32x8Z" : vextract_for_size<Opcode256,
+    defm NAME # "32x8Z" : vextract_for_size_split<Opcode256,
                                  X86VectorVTInfo<16, EltVT32, VR512>,
                                  X86VectorVTInfo< 8, EltVT32, VR256X>,
-                                 vextract256_extract>,
+                                 null_frag, vextract256_extract>,
                                      EVEX_V512, EVEX_CD8<32, CD8VT8>;
   }
 }
@@ -737,21 +792,21 @@ defm VEXTRACTF : vextract_for_type<f32,
 defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>;
 
 // extract_subvector codegen patterns with the alternative types.
-// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
+// Even with AVX512DQ we'll still use these for unmasked operations.
 defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
-          vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+          vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
-          vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+          vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
 
 defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
-          vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+          vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
 defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
-          vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+          vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
 
 defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
-          vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+          vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
-          vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+          vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
 
 // Codegen pattern with the alternative types extract VEC128 from VEC256
 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,

Modified: llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-cvt.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cvt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-cvt.ll Thu Aug 17 08:40:25 2017
@@ -642,19 +642,12 @@ define <4 x i32> @fptosi03(<4 x double>
 }
 
 define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
-; NODQ-LABEL: fptrunc00:
-; NODQ:       # BB#0:
-; NODQ-NEXT:    vcvtpd2ps %zmm0, %ymm0
-; NODQ-NEXT:    vcvtpd2ps %zmm1, %ymm1
-; NODQ-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; NODQ-NEXT:    retq
-;
-; DQ-LABEL: fptrunc00:
-; DQ:       # BB#0:
-; DQ-NEXT:    vcvtpd2ps %zmm0, %ymm0
-; DQ-NEXT:    vcvtpd2ps %zmm1, %ymm1
-; DQ-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
-; DQ-NEXT:    retq
+; ALL-LABEL: fptrunc00:
+; ALL:       # BB#0:
+; ALL-NEXT:    vcvtpd2ps %zmm0, %ymm0
+; ALL-NEXT:    vcvtpd2ps %zmm1, %ymm1
+; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    retq
   %a = fptrunc <16 x double> %b to <16 x float>
   ret <16 x float> %a
 }
@@ -876,21 +869,13 @@ define i32 @float_to_int(float %x) {
 }
 
 define <16 x double> @uitof64(<16 x i32> %a) nounwind {
-; NODQ-LABEL: uitof64:
-; NODQ:       # BB#0:
-; NODQ-NEXT:    vcvtudq2pd %ymm0, %zmm2
-; NODQ-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; NODQ-NEXT:    vcvtudq2pd %ymm0, %zmm1
-; NODQ-NEXT:    vmovaps %zmm2, %zmm0
-; NODQ-NEXT:    retq
-;
-; DQ-LABEL: uitof64:
-; DQ:       # BB#0:
-; DQ-NEXT:    vcvtudq2pd %ymm0, %zmm2
-; DQ-NEXT:    vextractf32x8 $1, %zmm0, %ymm0
-; DQ-NEXT:    vcvtudq2pd %ymm0, %zmm1
-; DQ-NEXT:    vmovaps %zmm2, %zmm0
-; DQ-NEXT:    retq
+; ALL-LABEL: uitof64:
+; ALL:       # BB#0:
+; ALL-NEXT:    vcvtudq2pd %ymm0, %zmm2
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT:    vcvtudq2pd %ymm0, %zmm1
+; ALL-NEXT:    vmovaps %zmm2, %zmm0
+; ALL-NEXT:    retq
   %b = uitofp <16 x i32> %a to <16 x double>
   ret <16 x double> %b
 }

Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Thu Aug 17 08:40:25 2017
@@ -20,25 +20,15 @@ define <16 x float> @test1(<16 x float>
 }
 
 define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
-; KNL-LABEL: test2:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; KNL-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
-; KNL-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
-; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; KNL-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
-; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
-;
-; SKX-LABEL: test2:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; SKX-NEXT:    vinsertf64x2 $0, %xmm2, %zmm0, %zmm2
-; SKX-NEXT:    vextractf64x2 $3, %zmm0, %xmm0
-; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SKX-NEXT:    vinsertf64x2 $3, %xmm0, %zmm2, %zmm0
-; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
+; CHECK-LABEL: test2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
+; CHECK-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
+; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; CHECK-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %rrr = load double, double* %br
   %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
   %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@@ -59,23 +49,14 @@ define <16 x float> @test3(<16 x float>
 }
 
 define <8 x i64> @test4(<8 x i64> %x) nounwind {
-; KNL-LABEL: test4:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; KNL-NEXT:    vmovq %xmm1, %rax
-; KNL-NEXT:    vpinsrq $1, %rax, %xmm0, %xmm1
-; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
-;
-; SKX-LABEL: test4:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vextracti64x2 $2, %zmm0, %xmm1
-; SKX-NEXT:    vmovq %xmm1, %rax
-; SKX-NEXT:    vpinsrq $1, %rax, %xmm0, %xmm1
-; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
+; CHECK-LABEL: test4:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; CHECK-NEXT:    vmovq %xmm1, %rax
+; CHECK-NEXT:    vpinsrq $1, %rax, %xmm0, %xmm1
+; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
+; CHECK-NEXT:    ## -- End function
   %eee = extractelement <8 x i64> %x, i32 4
   %rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
   ret <8 x i64> %rrr2
@@ -477,7 +458,7 @@ define i64 @extract_v8i64(<8 x i64> %x,
 ; SKX-LABEL: extract_v8i64:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpextrq $1, %xmm0, %rax
-; SKX-NEXT:    vextracti64x2 $1, %zmm0, %xmm0
+; SKX-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
 ; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi)
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -693,23 +674,14 @@ define i8 @extract_v16i8(<16 x i8> %x, i
 }
 
 define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
-; KNL-LABEL: insert_v8i64:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
-; KNL-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
-; KNL-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
-; KNL-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: insert_v8i64:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm1
-; SKX-NEXT:    vextracti64x2 $1, %zmm0, %xmm0
-; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
-; SKX-NEXT:    vinserti64x2 $1, %xmm0, %zmm1, %zmm0
-; SKX-NEXT:    retq
+; CHECK-LABEL: insert_v8i64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
+; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; CHECK-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
+; CHECK-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
+; CHECK-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %val = load i64, i64* %ptr
   %r1 = insertelement <8 x i64> %x, i64 %val, i32 1
   %r2 = insertelement <8 x i64> %r1, i64 %y, i32 3
@@ -888,17 +860,11 @@ define <16 x i8> @insert_v16i8(<16 x i8>
 }
 
 define <8 x i64> @test_insert_128_v8i64(<8 x i64> %x, i64 %y) {
-; KNL-LABEL: test_insert_128_v8i64:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
-; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: test_insert_128_v8i64:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
-; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT:    retq
+; CHECK-LABEL: test_insert_128_v8i64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
+; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %r = insertelement <8 x i64> %x, i64 %y, i32 1
   ret <8 x i64> %r
 }
@@ -914,17 +880,11 @@ define <16 x i32> @test_insert_128_v16i3
 }
 
 define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
-; KNL-LABEL: test_insert_128_v8f64:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
-; KNL-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: test_insert_128_v8f64:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
-; SKX-NEXT:    vinsertf64x2 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT:    retq
+; CHECK-LABEL: test_insert_128_v8f64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %r = insertelement <8 x double> %x, double %y, i32 1
   ret <8 x double> %r
 }

Modified: llvm/trunk/test/CodeGen/X86/avx512-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-trunc.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-trunc.ll Thu Aug 17 08:40:25 2017
@@ -726,27 +726,16 @@ define <8 x i8> @usat_trunc_wb_128(<8 x
 }
 
 define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) {
-; KNL-LABEL: usat_trunc_qw_1024:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm2
-; KNL-NEXT:    vpminuq %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vpminuq %zmm2, %zmm0, %zmm0
-; KNL-NEXT:    vpmovqd %zmm0, %ymm0
-; KNL-NEXT:    vpmovqd %zmm1, %ymm1
-; KNL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL-NEXT:    vpmovdw %zmm0, %ymm0
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_qw_1024:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm2
-; SKX-NEXT:    vpminuq %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vpminuq %zmm2, %zmm0, %zmm0
-; SKX-NEXT:    vpmovqd %zmm0, %ymm0
-; SKX-NEXT:    vpmovqd %zmm1, %ymm1
-; SKX-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; SKX-NEXT:    vpmovdw %zmm0, %ymm0
-; SKX-NEXT:    retq
+; ALL-LABEL: usat_trunc_qw_1024:
+; ALL:       ## BB#0:
+; ALL-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; ALL-NEXT:    vpminuq %zmm2, %zmm1, %zmm1
+; ALL-NEXT:    vpminuq %zmm2, %zmm0, %zmm0
+; ALL-NEXT:    vpmovqd %zmm0, %ymm0
+; ALL-NEXT:    vpmovqd %zmm1, %ymm1
+; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; ALL-NEXT:    vpmovdw %zmm0, %ymm0
+; ALL-NEXT:    retq
   %x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
   %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
   %x6 = trunc <16 x i64> %x5 to <16 x i16>

Modified: llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll Thu Aug 17 08:40:25 2017
@@ -6,7 +6,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vextractf64x2 $1, %zmm0, %xmm0
+; CHECK-NEXT:    vextractf32x4 $1, %zmm0, %xmm0
 ; CHECK-NEXT:    kmovw %edi, %k0
 ; CHECK-NEXT:    kshiftlb $7, %k0, %k1
 ; CHECK-NEXT:    kshiftrb $7, %k1, %k1
@@ -36,7 +36,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vextractf32x8 $1, %zmm0, %ymm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf32x8 $1, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vaddps %ymm2, %ymm1, %ymm1
@@ -56,7 +56,7 @@ declare <16 x float> @llvm.x86.avx512.ma
 define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm3
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
@@ -76,7 +76,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vinsertf64x2 $1, %xmm1, %zmm0, %zmm3
+; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
@@ -96,7 +96,7 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm3
+; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
@@ -116,7 +116,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vinserti64x2 $1, %xmm1, %zmm0, %zmm3
+; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
@@ -162,7 +162,7 @@ define <16 x float>@test_int_x86_avx512_
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; CHECK-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm2
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm2
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vaddps %zmm1, %zmm2, %zmm1
@@ -231,7 +231,7 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; CHECK-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm2
+; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm1

Modified: llvm/trunk/test/CodeGen/X86/compress_expand.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/compress_expand.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/compress_expand.ll (original)
+++ llvm/trunk/test/CodeGen/X86/compress_expand.ll Thu Aug 17 08:40:25 2017
@@ -334,7 +334,7 @@ define <32 x float> @test15(float* %base
 define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %trigger) {
 ; SKX-LABEL: test16:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vextracti32x8 $1, %zmm2, %ymm3
+; SKX-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
 ; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k2

Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Thu Aug 17 08:40:25 2017
@@ -422,10 +422,10 @@ define <16 x i32> @test8(<16 x i32*> %pt
 ; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k3}
 ; SKX-NEXT:    kmovw %k1, %k3
 ; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k3}
-; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm4
+; SKX-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm4
 ; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
 ; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
-; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
 ; SKX-NEXT:    vpaddd %zmm0, %zmm4, %zmm0
 ; SKX-NEXT:    retq
 ;
@@ -750,7 +750,7 @@ define <16 x float> @test14(float* %base
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k2}
 ; SKX-NEXT:    vgatherqps (,%zmm0), %ymm2 {%k1}
-; SKX-NEXT:    vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
+; SKX-NEXT:    vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test14:
@@ -1686,11 +1686,11 @@ define <16 x i32> @test_gather_16i32(<16
 ; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
 ; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
-; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm2
+; SKX-NEXT:    vextracti64x4 $1, %zmm3, %ymm2
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX-NEXT:    vpgatherqd (,%zmm1), %ymm2 {%k2}
 ; SKX-NEXT:    vpgatherqd (,%zmm0), %ymm3 {%k1}
-; SKX-NEXT:    vinserti32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test_gather_16i32:
@@ -1772,7 +1772,7 @@ define <16 x i64> @test_gather_16i64(<16
 ; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
 ; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX_32-NEXT:    vpgatherdq (,%ymm0), %zmm2 {%k1}
-; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; SKX_32-NEXT:    vpgatherdq (,%ymm0), %zmm1 {%k2}
 ; SKX_32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; SKX_32-NEXT:    movl %ebp, %esp
@@ -1809,11 +1809,11 @@ define <16 x float> @test_gather_16f32(<
 ; SKX-NEXT:    vpmovsxbd %xmm2, %zmm2
 ; SKX-NEXT:    vpslld $31, %zmm2, %zmm2
 ; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
-; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm2
+; SKX-NEXT:    vextractf64x4 $1, %zmm3, %ymm2
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX-NEXT:    vgatherqps (,%zmm1), %ymm2 {%k2}
 ; SKX-NEXT:    vgatherqps (,%zmm0), %ymm3 {%k1}
-; SKX-NEXT:    vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
+; SKX-NEXT:    vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test_gather_16f32:
@@ -1895,7 +1895,7 @@ define <16 x double> @test_gather_16f64(
 ; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
 ; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX_32-NEXT:    vgatherdpd (,%ymm0), %zmm2 {%k1}
-; SKX_32-NEXT:    vextractf32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; SKX_32-NEXT:    vgatherdpd (,%ymm0), %zmm1 {%k2}
 ; SKX_32-NEXT:    vmovapd %zmm2, %zmm0
 ; SKX_32-NEXT:    movl %ebp, %esp
@@ -1934,7 +1934,7 @@ define void @test_scatter_16i32(<16 x i3
 ; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
-; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm0
+; SKX-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
 ; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -2016,7 +2016,7 @@ define void @test_scatter_16i64(<16 x i6
 ; SKX_32-NEXT:    vmovdqa64 8(%ebp), %zmm1
 ; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX_32-NEXT:    vpscatterdq %zmm2, (,%ymm0) {%k1}
-; SKX_32-NEXT:    vextracti32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; SKX_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
 ; SKX_32-NEXT:    movl %ebp, %esp
 ; SKX_32-NEXT:    popl %ebp
@@ -2055,7 +2055,7 @@ define void @test_scatter_16f32(<16 x fl
 ; SKX-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
-; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm0
+; SKX-NEXT:    vextractf64x4 $1, %zmm3, %ymm0
 ; SKX-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
@@ -2138,7 +2138,7 @@ define void @test_scatter_16f64(<16 x do
 ; SKX_32-NEXT:    vmovapd 8(%ebp), %zmm1
 ; SKX_32-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX_32-NEXT:    vscatterdpd %zmm2, (,%ymm0) {%k1}
-; SKX_32-NEXT:    vextractf32x8 $1, %zmm0, %ymm0
+; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; SKX_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
 ; SKX_32-NEXT:    movl %ebp, %esp
 ; SKX_32-NEXT:    popl %ebp

Modified: llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-fp-avx512.ll Thu Aug 17 08:40:25 2017
@@ -458,7 +458,7 @@ define <4 x float> @stack_fold_extracti3
 
 define <2 x double> @stack_fold_extractf64x2(<8 x double> %a0, <8 x double> %a1) {
   ;CHECK-LABEL: stack_fold_extractf64x2
-  ;CHECK:       vextractf64x2 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  ;CHECK:       vextractf32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
   %1 = shufflevector <8 x double> %a0, <8 x double> %a1, <2 x i32> <i32 6, i32 7>
   %2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <2 x double> %1
@@ -466,7 +466,7 @@ define <2 x double> @stack_fold_extractf
 
 define <8 x float> @stack_fold_extracti32x8(<16 x float> %a0, <16 x float> %a1) {
   ;CHECK-LABEL: stack_fold_extracti32x8
-  ;CHECK:       vextractf32x8 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
+  ;CHECK:       vextractf64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
   %1 = shufflevector <16 x float> %a0, <16 x float> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   ret <8 x float> %1
@@ -482,7 +482,7 @@ define <4 x double> @stack_fold_extractf
 
 define <16 x float> @stack_fold_insertf32x8(<8 x float> %a0, <8 x float> %a1) {
   ;CHECK-LABEL: stack_fold_insertf32x8
-  ;CHECK:       vinsertf32x8 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  ;CHECK:       vinsertf64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x float> %2

Modified: llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/stack-folding-int-avx512.ll Thu Aug 17 08:40:25 2017
@@ -130,7 +130,7 @@ define <4 x i32> @stack_fold_extracti32x
 
 define <2 x i64> @stack_fold_extracti64x2(<8 x i64> %a0, <8 x i64> %a1) {
   ;CHECK-LABEL: stack_fold_extracti64x2
-  ;CHECK:       vextracti64x2 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  ;CHECK:       vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
   ; add forces execution domain
   %1 = add <8 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
   %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
@@ -140,7 +140,7 @@ define <2 x i64> @stack_fold_extracti64x
 
 define <8 x i32> @stack_fold_extracti32x8(<16 x i32> %a0, <16 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_extracti32x8
-  ;CHECK:       vextracti32x8 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
+  ;CHECK:       vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
   ; add forces execution domain
   %1 = add <16 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -160,7 +160,7 @@ define <4 x i64> @stack_fold_extracti64x
 
 define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_inserti32x8
-  ;CHECK:       vinserti32x8 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  ;CHECK:       vinserti64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ; add forces execution domain

Modified: llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll Thu Aug 17 08:40:25 2017
@@ -1457,26 +1457,12 @@ define <16 x float> @reg_broadcast_4f32_
 ; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X32-AVX-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: reg_broadcast_4f32_16f32:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: reg_broadcast_4f32_16f32:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: reg_broadcast_4f32_16f32:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: reg_broadcast_4f32_16f32:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
 ; X64-AVX:       # BB#0:
@@ -1485,26 +1471,12 @@ define <16 x float> @reg_broadcast_4f32_
 ; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X64-AVX-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: reg_broadcast_4f32_16f32:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: reg_broadcast_4f32_16f32:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: reg_broadcast_4f32_16f32:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    retq
  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x float> %1
 }
@@ -1515,46 +1487,22 @@ define <16 x float> @reg_broadcast_8f32_
 ; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X32-AVX-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: reg_broadcast_8f32_16f32:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: reg_broadcast_8f32_16f32:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: reg_broadcast_8f32_16f32:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X32-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: reg_broadcast_8f32_16f32:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
 ; X64-AVX:       # BB#0:
 ; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X64-AVX-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: reg_broadcast_8f32_16f32:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X64-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: reg_broadcast_8f32_16f32:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: reg_broadcast_8f32_16f32:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X64-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    retq
  %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %1
 }
@@ -1583,26 +1531,12 @@ define <16 x i32> @reg_broadcast_4i32_16
 ; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X32-AVX-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: reg_broadcast_4i32_16i32:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: reg_broadcast_4i32_16i32:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: reg_broadcast_4i32_16i32:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X32-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: reg_broadcast_4i32_16i32:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X32-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
 ; X64-AVX:       # BB#0:
@@ -1611,26 +1545,12 @@ define <16 x i32> @reg_broadcast_4i32_16
 ; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X64-AVX-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: reg_broadcast_4i32_16i32:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: reg_broadcast_4i32_16i32:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: reg_broadcast_4i32_16i32:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; X64-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X64-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    retq
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %1
 }
@@ -1641,46 +1561,22 @@ define <16 x i32> @reg_broadcast_8i32_16
 ; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X32-AVX-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: reg_broadcast_8i32_16i32:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: reg_broadcast_8i32_16i32:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: reg_broadcast_8i32_16i32:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X32-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: reg_broadcast_8i32_16i32:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
 ; X64-AVX:       # BB#0:
 ; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X64-AVX-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: reg_broadcast_8i32_16i32:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X64-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: reg_broadcast_8i32_16i32:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: reg_broadcast_8i32_16i32:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; X64-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    retq
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i32> %1
 }

Modified: llvm/trunk/test/CodeGen/X86/vector-compare-results.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-compare-results.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-results.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-compare-results.ll Thu Aug 17 08:40:25 2017
@@ -2073,353 +2073,121 @@ define <16 x i1> @test_cmp_v16f64(<16 x
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: test_cmp_v16f64:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vextractf32x4 $3, %zmm2, %xmm4
-; AVX512F-NEXT:    vextractf32x4 $3, %zmm0, %xmm5
-; AVX512F-NEXT:    xorl %eax, %eax
-; AVX512F-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512F-NEXT:    movq $-1, %rcx
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm6
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512F-NEXT:    vextractf32x4 $2, %zmm2, %xmm5
-; AVX512F-NEXT:    vextractf32x4 $2, %zmm0, %xmm6
-; AVX512F-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm7
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512F-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512F-NEXT:    vextractf32x4 $1, %zmm2, %xmm5
-; AVX512F-NEXT:    vextractf32x4 $1, %zmm0, %xmm6
-; AVX512F-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm7
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512F-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512F-NEXT:    vucomisd %xmm2, %xmm0
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm6
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT:    vucomisd %xmm2, %xmm0
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vextractf32x4 $3, %zmm3, %xmm2
-; AVX512F-NEXT:    vextractf32x4 $3, %zmm1, %xmm4
-; AVX512F-NEXT:    vucomisd %xmm2, %xmm4
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm5
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT:    vucomisd %xmm2, %xmm4
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512F-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
-; AVX512F-NEXT:    vextractf32x4 $2, %zmm1, %xmm5
-; AVX512F-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm6
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vextractf32x4 $1, %zmm3, %xmm4
-; AVX512F-NEXT:    vextractf32x4 $1, %zmm1, %xmm5
-; AVX512F-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm6
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512F-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512F-NEXT:    vucomisd %xmm3, %xmm1
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovaq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm5
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT:    vucomisd %xmm3, %xmm1
-; AVX512F-NEXT:    cmovaq %rcx, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512DQ-LABEL: test_cmp_v16f64:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm2, %xmm4
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm0, %xmm5
-; AVX512DQ-NEXT:    xorl %eax, %eax
-; AVX512DQ-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT:    movq $-1, %rcx
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm6
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm4
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm2, %xmm5
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm0, %xmm6
-; AVX512DQ-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm7
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512DQ-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm5
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm2, %xmm5
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm0, %xmm6
-; AVX512DQ-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm7
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512DQ-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm5
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512DQ-NEXT:    vucomisd %xmm2, %xmm0
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm6
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512DQ-NEXT:    vucomisd %xmm2, %xmm0
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm0
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm3, %xmm2
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm1, %xmm4
-; AVX512DQ-NEXT:    vucomisd %xmm2, %xmm4
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm5
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT:    vucomisd %xmm2, %xmm4
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm2
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm3, %xmm4
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm1, %xmm5
-; AVX512DQ-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm6
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm4
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm3, %xmm4
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm1, %xmm5
-; AVX512DQ-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm6
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512DQ-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm4
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512DQ-NEXT:    vucomisd %xmm3, %xmm1
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm5
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512DQ-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512DQ-NEXT:    vucomisd %xmm3, %xmm1
-; AVX512DQ-NEXT:    cmovaq %rcx, %rax
-; AVX512DQ-NEXT:    vmovq %rax, %xmm1
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: test_cmp_v16f64:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextractf32x4 $3, %zmm2, %xmm4
-; AVX512BW-NEXT:    vextractf32x4 $3, %zmm0, %xmm5
-; AVX512BW-NEXT:    xorl %eax, %eax
-; AVX512BW-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT:    movq $-1, %rcx
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm6
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512BW-NEXT:    vextractf32x4 $2, %zmm2, %xmm5
-; AVX512BW-NEXT:    vextractf32x4 $2, %zmm0, %xmm6
-; AVX512BW-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm7
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512BW-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512BW-NEXT:    vextractf32x4 $1, %zmm2, %xmm5
-; AVX512BW-NEXT:    vextractf32x4 $1, %zmm0, %xmm6
-; AVX512BW-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm7
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
-; AVX512BW-NEXT:    vucomisd %xmm5, %xmm6
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
-; AVX512BW-NEXT:    vucomisd %xmm2, %xmm0
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm6
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512BW-NEXT:    vucomisd %xmm2, %xmm0
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vextractf32x4 $3, %zmm3, %xmm2
-; AVX512BW-NEXT:    vextractf32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT:    vucomisd %xmm2, %xmm4
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm5
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT:    vucomisd %xmm2, %xmm4
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
-; AVX512BW-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
-; AVX512BW-NEXT:    vextractf32x4 $2, %zmm1, %xmm5
-; AVX512BW-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm6
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512BW-NEXT:    vextractf32x4 $1, %zmm3, %xmm4
-; AVX512BW-NEXT:    vextractf32x4 $1, %zmm1, %xmm5
-; AVX512BW-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm6
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; AVX512BW-NEXT:    vucomisd %xmm4, %xmm5
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
-; AVX512BW-NEXT:    vucomisd %xmm3, %xmm1
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovaq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm5
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512BW-NEXT:    vucomisd %xmm3, %xmm1
-; AVX512BW-NEXT:    cmovaq %rcx, %rax
-; AVX512BW-NEXT:    vmovq %rax, %xmm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512-LABEL: test_cmp_v16f64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vextractf32x4 $3, %zmm2, %xmm4
+; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm5
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    vucomisd %xmm4, %xmm5
+; AVX512-NEXT:    movq $-1, %rcx
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm6
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT:    vucomisd %xmm4, %xmm5
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512-NEXT:    vextractf32x4 $2, %zmm2, %xmm5
+; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm6
+; AVX512-NEXT:    vucomisd %xmm5, %xmm6
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm7
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512-NEXT:    vucomisd %xmm5, %xmm6
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm5
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512-NEXT:    vextractf32x4 $1, %zmm2, %xmm5
+; AVX512-NEXT:    vextractf32x4 $1, %zmm0, %xmm6
+; AVX512-NEXT:    vucomisd %xmm5, %xmm6
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm7
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm6[1,0]
+; AVX512-NEXT:    vucomisd %xmm5, %xmm6
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm5
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm7[0],xmm5[0]
+; AVX512-NEXT:    vucomisd %xmm2, %xmm0
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm6
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vucomisd %xmm2, %xmm0
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm6[0],xmm0[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vextractf32x4 $3, %zmm3, %xmm2
+; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm4
+; AVX512-NEXT:    vucomisd %xmm2, %xmm4
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm5
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT:    vucomisd %xmm2, %xmm4
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
+; AVX512-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
+; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm5
+; AVX512-NEXT:    vucomisd %xmm4, %xmm5
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm6
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT:    vucomisd %xmm4, %xmm5
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512-NEXT:    vextractf32x4 $1, %zmm3, %xmm4
+; AVX512-NEXT:    vextractf32x4 $1, %zmm1, %xmm5
+; AVX512-NEXT:    vucomisd %xmm4, %xmm5
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm6
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm4[1,0]
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
+; AVX512-NEXT:    vucomisd %xmm4, %xmm5
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
+; AVX512-NEXT:    vucomisd %xmm3, %xmm1
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovaq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm5
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT:    vucomisd %xmm3, %xmm1
+; AVX512-NEXT:    cmovaq %rcx, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = fcmp ogt <16 x double> %a0, %a1
   ret <16 x i1> %1
 }
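
Aside (illustration, not part of this commit): the AVX512F, AVX512DQ and AVX512BW check blocks above collapse into a single AVX512 block because the unmasked 128-bit extracts no longer diverge by feature set. A minimal sketch of the unmasked case, with a hypothetical function name:

define <2 x double> @extract_hi_v8f64(<8 x double> %v) {
; Unmasked extract of the top 128 bits. With this change both AVX512F
; and AVX512DQ select vextractf32x4 $3; previously DQI targets switched
; to vextractf64x2 $3, forcing the separate check prefixes.
  %e = shufflevector <8 x double> %v, <8 x double> undef, <2 x i32> <i32 6, i32 7>
  ret <2 x double> %e
}
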
@@ -3060,7 +2828,7 @@ define <32 x i1> @test_cmp_v32f32(<32 x
 ; AVX512DQ-NEXT:    cmoval %ecx, %edx
 ; AVX512DQ-NEXT:    vpinsrd $3, %edx, %xmm5, %xmm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm8, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm8
 ; AVX512DQ-NEXT:    vextractf32x4 $3, %zmm3, %xmm2
 ; AVX512DQ-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3]
@@ -3157,7 +2925,7 @@ define <32 x i1> @test_cmp_v32f32(<32 x
 ; AVX512DQ-NEXT:    cmoval %ecx, %eax
 ; AVX512DQ-NEXT:    vpinsrd $3, %eax, %xmm4, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm8, %ymm0
 ; AVX512DQ-NEXT:    retq
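
Aside (illustration, not part of this commit): the two one-line replacements above are the same normalization on the insert side, with the unmasked 256-bit insert in the AVX512DQ run now staying on vinserti64x4 instead of switching to vinserti32x8. The masked extract is where the DQI patterns are kept; a hand-written sketch with a hypothetical name, assuming the zeroing select folds as the log suggests:

define <2 x double> @extract_hi_v8f64_maskz(<8 x double> %v, i8 %m) {
; Extract the top 128 bits, then zero the lanes whose mask bit is clear.
  %e = shufflevector <8 x double> %v, <8 x double> undef, <2 x i32> <i32 6, i32 7>
  %mv = bitcast i8 %m to <8 x i1>
  %k = shufflevector <8 x i1> %mv, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
; A qword-granularity zeroing mask: only vextractf64x2 (DQI) can encode
; {%k1}{z} here, which is why the masked DQI variants remain enabled.
  %r = select <2 x i1> %k, <2 x double> %e, <2 x double> zeroinitializer
  ret <2 x double> %r
}
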
@@ -3295,638 +3063,374 @@ define <32 x i1> @test_cmp_v32f32(<32 x
 ; AVX512BW-NEXT:    vucomiss %xmm5, %xmm7
 ; AVX512BW-NEXT:    movl $0, %edx
 ; AVX512BW-NEXT:    cmoval %ecx, %edx
-; AVX512BW-NEXT:    vucomiss %xmm4, %xmm6
-; AVX512BW-NEXT:    movl $0, %esi
-; AVX512BW-NEXT:    cmoval %ecx, %esi
-; AVX512BW-NEXT:    vmovd %esi, %xmm5
-; AVX512BW-NEXT:    vpinsrd $1, %edx, %xmm5, %xmm5
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
-; AVX512BW-NEXT:    vucomiss %xmm7, %xmm0
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmoval %ecx, %edx
-; AVX512BW-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm0
-; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
-; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
-; AVX512BW-NEXT:    vucomiss %xmm4, %xmm5
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmoval %ecx, %edx
-; AVX512BW-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT:    vextractf32x4 $1, %zmm3, %xmm0
-; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; AVX512BW-NEXT:    vextractf32x4 $1, %zmm1, %xmm5
-; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; AVX512BW-NEXT:    vucomiss %xmm4, %xmm6
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmoval %ecx, %edx
-; AVX512BW-NEXT:    vucomiss %xmm0, %xmm5
-; AVX512BW-NEXT:    movl $0, %esi
-; AVX512BW-NEXT:    cmoval %ecx, %esi
-; AVX512BW-NEXT:    vmovd %esi, %xmm4
-; AVX512BW-NEXT:    vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
-; AVX512BW-NEXT:    vucomiss %xmm6, %xmm7
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmoval %ecx, %edx
-; AVX512BW-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; AVX512BW-NEXT:    vucomiss %xmm0, %xmm5
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmoval %ecx, %edx
-; AVX512BW-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm0
-; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; AVX512BW-NEXT:    vucomiss %xmm4, %xmm5
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmoval %ecx, %edx
-; AVX512BW-NEXT:    vucomiss %xmm3, %xmm1
-; AVX512BW-NEXT:    movl $0, %esi
-; AVX512BW-NEXT:    cmoval %ecx, %esi
-; AVX512BW-NEXT:    vmovd %esi, %xmm4
-; AVX512BW-NEXT:    vpinsrd $1, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
-; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
-; AVX512BW-NEXT:    vucomiss %xmm5, %xmm6
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmoval %ecx, %edx
-; AVX512BW-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
-; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512BW-NEXT:    vucomiss %xmm3, %xmm1
-; AVX512BW-NEXT:    cmoval %ecx, %eax
-; AVX512BW-NEXT:    vpinsrd $3, %eax, %xmm4, %xmm1
-; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    retq
-  %1 = fcmp ogt <32 x float> %a0, %a1
-  ret <32 x i1> %1
-}
-
-define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
-; SSE2-LABEL: test_cmp_v16i64:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
-; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm9
-; SSE2-NEXT:    movdqa %xmm7, %xmm10
-; SSE2-NEXT:    pcmpgtd %xmm9, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm7, %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3]
-; SSE2-NEXT:    pand %xmm11, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
-; SSE2-NEXT:    por %xmm7, %xmm9
-; SSE2-NEXT:    pxor %xmm8, %xmm6
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    movdqa %xmm6, %xmm10
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm10
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pand %xmm11, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
-; SSE2-NEXT:    por %xmm7, %xmm10
-; SSE2-NEXT:    packsswb %xmm9, %xmm10
-; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm7
-; SSE2-NEXT:    pxor %xmm8, %xmm7
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm5, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT:    pand %xmm9, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm6
-; SSE2-NEXT:    pxor %xmm8, %xmm4
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    movdqa %xmm4, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm9, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
-; SSE2-NEXT:    por %xmm5, %xmm4
-; SSE2-NEXT:    packsswb %xmm6, %xmm4
-; SSE2-NEXT:    packsswb %xmm10, %xmm4
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm5
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm3, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm5
-; SSE2-NEXT:    pxor %xmm8, %xmm2
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm7, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT:    por %xmm3, %xmm2
-; SSE2-NEXT:    packsswb %xmm5, %xmm2
-; SSE2-NEXT:    pxor %xmm8, %xmm1
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
-; SSE2-NEXT:    pxor %xmm8, %xmm3
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
-; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT:    pand %xmm6, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm3
-; SSE2-NEXT:    pxor %xmm8, %xmm0
-; SSE2-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm8
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pcmpgtd %xmm8, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm6
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT:    por %xmm6, %xmm0
-; SSE2-NEXT:    packsswb %xmm3, %xmm0
-; SSE2-NEXT:    packsswb %xmm2, %xmm0
-; SSE2-NEXT:    packsswb %xmm4, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE42-LABEL: test_cmp_v16i64:
-; SSE42:       # BB#0:
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    packsswb %xmm7, %xmm6
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT:    packsswb %xmm5, %xmm4
-; SSE42-NEXT:    packsswb %xmm6, %xmm4
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm3
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT:    packsswb %xmm3, %xmm2
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm1
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    packsswb %xmm1, %xmm0
-; SSE42-NEXT:    packsswb %xmm2, %xmm0
-; SSE42-NEXT:    packsswb %xmm4, %xmm0
-; SSE42-NEXT:    retq
-;
-; AVX1-LABEL: test_cmp_v16i64:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm9
-; AVX1-NEXT:    vpcmpgtq %xmm8, %xmm9, %xmm8
-; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpacksswb %xmm8, %xmm3, %xmm8
-; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vpacksswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpacksswb %xmm8, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpacksswb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_cmp_v16i64:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vpcmpgtq %ymm7, %ymm3, %ymm3
-; AVX2-NEXT:    vpcmpgtq %ymm6, %ymm2, %ymm2
-; AVX2-NEXT:    vpacksswb %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
-; AVX2-NEXT:    vpcmpgtq %ymm5, %ymm1, %ymm1
-; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm0, %ymm0
-; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vpacksswb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: test_cmp_v16i64:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm2, %xmm4
-; AVX512F-NEXT:    vpextrq $1, %xmm4, %rcx
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm5
-; AVX512F-NEXT:    vpextrq $1, %xmm5, %rdx
-; AVX512F-NEXT:    xorl %eax, %eax
-; AVX512F-NEXT:    cmpq %rcx, %rdx
-; AVX512F-NEXT:    movq $-1, %rcx
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm6
-; AVX512F-NEXT:    vmovq %xmm4, %rdx
-; AVX512F-NEXT:    vmovq %xmm5, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm2, %xmm5
-; AVX512F-NEXT:    vpextrq $1, %xmm5, %rdx
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm6
-; AVX512F-NEXT:    vpextrq $1, %xmm6, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm7
-; AVX512F-NEXT:    vmovq %xmm5, %rdx
-; AVX512F-NEXT:    vmovq %xmm6, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512F-NEXT:    vextracti32x4 $1, %zmm2, %xmm5
-; AVX512F-NEXT:    vpextrq $1, %xmm5, %rdx
-; AVX512F-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
-; AVX512F-NEXT:    vpextrq $1, %xmm6, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm7
-; AVX512F-NEXT:    vmovq %xmm5, %rdx
-; AVX512F-NEXT:    vmovq %xmm6, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm5
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rdx
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm6
-; AVX512F-NEXT:    vmovq %xmm2, %rdx
-; AVX512F-NEXT:    vmovq %xmm0, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm3, %xmm2
-; AVX512F-NEXT:    vpextrq $1, %xmm2, %rdx
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; AVX512F-NEXT:    vpextrq $1, %xmm4, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm5
-; AVX512F-NEXT:    vmovq %xmm2, %rdx
-; AVX512F-NEXT:    vmovq %xmm4, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm2
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm3, %xmm4
-; AVX512F-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; AVX512F-NEXT:    vpextrq $1, %xmm5, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm6
-; AVX512F-NEXT:    vmovq %xmm4, %rdx
-; AVX512F-NEXT:    vmovq %xmm5, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512F-NEXT:    vextracti32x4 $1, %zmm3, %xmm4
-; AVX512F-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX512F-NEXT:    vextracti32x4 $1, %zmm1, %xmm5
-; AVX512F-NEXT:    vpextrq $1, %xmm5, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm6
-; AVX512F-NEXT:    vmovq %xmm4, %rdx
-; AVX512F-NEXT:    vmovq %xmm5, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm4
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512F-NEXT:    vpextrq $1, %xmm3, %rdx
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    cmovgq %rcx, %rdx
-; AVX512F-NEXT:    vmovq %rdx, %xmm5
-; AVX512F-NEXT:    vmovq %xmm3, %rdx
-; AVX512F-NEXT:    vmovq %xmm1, %rsi
-; AVX512F-NEXT:    cmpq %rdx, %rsi
-; AVX512F-NEXT:    cmovgq %rcx, %rax
-; AVX512F-NEXT:    vmovq %rax, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512DQ-LABEL: test_cmp_v16i64:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm2, %xmm4
-; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rcx
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm0, %xmm5
-; AVX512DQ-NEXT:    vpextrq $1, %xmm5, %rdx
-; AVX512DQ-NEXT:    xorl %eax, %eax
-; AVX512DQ-NEXT:    cmpq %rcx, %rdx
-; AVX512DQ-NEXT:    movq $-1, %rcx
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm6
-; AVX512DQ-NEXT:    vmovq %xmm4, %rdx
-; AVX512DQ-NEXT:    vmovq %xmm5, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm4
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm2, %xmm5
-; AVX512DQ-NEXT:    vpextrq $1, %xmm5, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm0, %xmm6
-; AVX512DQ-NEXT:    vpextrq $1, %xmm6, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm7
-; AVX512DQ-NEXT:    vmovq %xmm5, %rdx
-; AVX512DQ-NEXT:    vmovq %xmm6, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm5
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm2, %xmm5
-; AVX512DQ-NEXT:    vpextrq $1, %xmm5, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm0, %xmm6
-; AVX512DQ-NEXT:    vpextrq $1, %xmm6, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm7
-; AVX512DQ-NEXT:    vmovq %xmm5, %rdx
-; AVX512DQ-NEXT:    vmovq %xmm6, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm5
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512DQ-NEXT:    vpextrq $1, %xmm2, %rdx
-; AVX512DQ-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm6
-; AVX512DQ-NEXT:    vmovq %xmm2, %rdx
-; AVX512DQ-NEXT:    vmovq %xmm0, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm0
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm3, %xmm2
-; AVX512DQ-NEXT:    vpextrq $1, %xmm2, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm1, %xmm4
-; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm5
-; AVX512DQ-NEXT:    vmovq %xmm2, %rdx
-; AVX512DQ-NEXT:    vmovq %xmm4, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm2
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm3, %xmm4
-; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm1, %xmm5
-; AVX512DQ-NEXT:    vpextrq $1, %xmm5, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm6
-; AVX512DQ-NEXT:    vmovq %xmm4, %rdx
-; AVX512DQ-NEXT:    vmovq %xmm5, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm4
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm3, %xmm4
-; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm1, %xmm5
-; AVX512DQ-NEXT:    vpextrq $1, %xmm5, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm6
-; AVX512DQ-NEXT:    vmovq %xmm4, %rdx
-; AVX512DQ-NEXT:    vmovq %xmm5, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm4
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512DQ-NEXT:    vpextrq $1, %xmm3, %rdx
-; AVX512DQ-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    movl $0, %edx
-; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
-; AVX512DQ-NEXT:    vmovq %rdx, %xmm5
-; AVX512DQ-NEXT:    vmovq %xmm3, %rdx
-; AVX512DQ-NEXT:    vmovq %xmm1, %rsi
-; AVX512DQ-NEXT:    cmpq %rdx, %rsi
-; AVX512DQ-NEXT:    cmovgq %rcx, %rax
-; AVX512DQ-NEXT:    vmovq %rax, %xmm1
-; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: test_cmp_v16i64:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm2, %xmm4
-; AVX512BW-NEXT:    vpextrq $1, %xmm4, %rcx
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm5
-; AVX512BW-NEXT:    vpextrq $1, %xmm5, %rdx
-; AVX512BW-NEXT:    xorl %eax, %eax
-; AVX512BW-NEXT:    cmpq %rcx, %rdx
-; AVX512BW-NEXT:    movq $-1, %rcx
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm6
-; AVX512BW-NEXT:    vmovq %xmm4, %rdx
-; AVX512BW-NEXT:    vmovq %xmm5, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm2, %xmm5
-; AVX512BW-NEXT:    vpextrq $1, %xmm5, %rdx
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm6
-; AVX512BW-NEXT:    vpextrq $1, %xmm6, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm7
-; AVX512BW-NEXT:    vmovq %xmm5, %rdx
-; AVX512BW-NEXT:    vmovq %xmm6, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm2, %xmm5
-; AVX512BW-NEXT:    vpextrq $1, %xmm5, %rdx
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
-; AVX512BW-NEXT:    vpextrq $1, %xmm6, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm7
-; AVX512BW-NEXT:    vmovq %xmm5, %rdx
-; AVX512BW-NEXT:    vmovq %xmm6, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm5
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %rdx
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm6
-; AVX512BW-NEXT:    vmovq %xmm2, %rdx
-; AVX512BW-NEXT:    vmovq %xmm0, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
-; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm3, %xmm2
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %rdx
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; AVX512BW-NEXT:    vpextrq $1, %xmm4, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
+; AVX512BW-NEXT:    vucomiss %xmm4, %xmm6
+; AVX512BW-NEXT:    movl $0, %esi
+; AVX512BW-NEXT:    cmoval %ecx, %esi
+; AVX512BW-NEXT:    vmovd %esi, %xmm5
+; AVX512BW-NEXT:    vpinsrd $1, %edx, %xmm5, %xmm5
+; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm7 = xmm4[1,0]
+; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm6[1,0]
+; AVX512BW-NEXT:    vucomiss %xmm7, %xmm0
 ; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm5
-; AVX512BW-NEXT:    vmovq %xmm2, %rdx
-; AVX512BW-NEXT:    vmovq %xmm4, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
+; AVX512BW-NEXT:    cmoval %ecx, %edx
+; AVX512BW-NEXT:    vpinsrd $2, %edx, %xmm5, %xmm0
+; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm4 = xmm4[3,1,2,3]
+; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm5 = xmm6[3,1,2,3]
+; AVX512BW-NEXT:    vucomiss %xmm4, %xmm5
 ; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm2
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm3, %xmm4
-; AVX512BW-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; AVX512BW-NEXT:    vpextrq $1, %xmm5, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
+; AVX512BW-NEXT:    cmoval %ecx, %edx
+; AVX512BW-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
+; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-NEXT:    vextractf32x4 $1, %zmm3, %xmm0
+; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; AVX512BW-NEXT:    vextractf32x4 $1, %zmm1, %xmm5
+; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3]
+; AVX512BW-NEXT:    vucomiss %xmm4, %xmm6
 ; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm6
-; AVX512BW-NEXT:    vmovq %xmm4, %rdx
-; AVX512BW-NEXT:    vmovq %xmm5, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
+; AVX512BW-NEXT:    cmoval %ecx, %edx
+; AVX512BW-NEXT:    vucomiss %xmm0, %xmm5
+; AVX512BW-NEXT:    movl $0, %esi
+; AVX512BW-NEXT:    cmoval %ecx, %esi
+; AVX512BW-NEXT:    vmovd %esi, %xmm4
+; AVX512BW-NEXT:    vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm0[1,0]
+; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm7 = xmm5[1,0]
+; AVX512BW-NEXT:    vucomiss %xmm6, %xmm7
 ; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm3, %xmm4
-; AVX512BW-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX512BW-NEXT:    vextracti32x4 $1, %zmm1, %xmm5
-; AVX512BW-NEXT:    vpextrq $1, %xmm5, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
+; AVX512BW-NEXT:    cmoval %ecx, %edx
+; AVX512BW-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm5 = xmm5[3,1,2,3]
+; AVX512BW-NEXT:    vucomiss %xmm0, %xmm5
 ; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm6
-; AVX512BW-NEXT:    vmovq %xmm4, %rdx
-; AVX512BW-NEXT:    vmovq %xmm5, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
+; AVX512BW-NEXT:    cmoval %ecx, %edx
+; AVX512BW-NEXT:    vpinsrd $3, %edx, %xmm4, %xmm0
+; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; AVX512BW-NEXT:    vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX512BW-NEXT:    vucomiss %xmm4, %xmm5
 ; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm4
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %rdx
-; AVX512BW-NEXT:    vpextrq $1, %xmm1, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
+; AVX512BW-NEXT:    cmoval %ecx, %edx
+; AVX512BW-NEXT:    vucomiss %xmm3, %xmm1
+; AVX512BW-NEXT:    movl $0, %esi
+; AVX512BW-NEXT:    cmoval %ecx, %esi
+; AVX512BW-NEXT:    vmovd %esi, %xmm4
+; AVX512BW-NEXT:    vpinsrd $1, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX512BW-NEXT:    vpermilpd {{.*#+}} xmm6 = xmm1[1,0]
+; AVX512BW-NEXT:    vucomiss %xmm5, %xmm6
 ; AVX512BW-NEXT:    movl $0, %edx
-; AVX512BW-NEXT:    cmovgq %rcx, %rdx
-; AVX512BW-NEXT:    vmovq %rdx, %xmm5
-; AVX512BW-NEXT:    vmovq %xmm3, %rdx
-; AVX512BW-NEXT:    vmovq %xmm1, %rsi
-; AVX512BW-NEXT:    cmpq %rdx, %rsi
-; AVX512BW-NEXT:    cmovgq %rcx, %rax
-; AVX512BW-NEXT:    vmovq %rax, %xmm1
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512BW-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    cmoval %ecx, %edx
+; AVX512BW-NEXT:    vpinsrd $2, %edx, %xmm4, %xmm4
+; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX512BW-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; AVX512BW-NEXT:    vucomiss %xmm3, %xmm1
+; AVX512BW-NEXT:    cmoval %ecx, %eax
+; AVX512BW-NEXT:    vpinsrd $3, %eax, %xmm4, %xmm1
+; AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    retq
+  %1 = fcmp ogt <32 x float> %a0, %a1
+  ret <32 x i1> %1
+}
+
+define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
+; SSE2-LABEL: test_cmp_v16i64:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [2147483648,0,2147483648,0]
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm9
+; SSE2-NEXT:    movdqa %xmm7, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm9, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm7, %xmm9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm9[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm10[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm9
+; SSE2-NEXT:    pxor %xmm8, %xmm6
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    movdqa %xmm6, %xmm10
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm6, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pand %xmm11, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT:    por %xmm7, %xmm10
+; SSE2-NEXT:    packsswb %xmm9, %xmm10
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT:    pxor %xmm8, %xmm7
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm7, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
+; SSE2-NEXT:    pand %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm6
+; SSE2-NEXT:    pxor %xmm8, %xmm4
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm4, %xmm7
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm7[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm9, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
+; SSE2-NEXT:    por %xmm5, %xmm4
+; SSE2-NEXT:    packsswb %xmm6, %xmm4
+; SSE2-NEXT:    packsswb %xmm10, %xmm4
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm5
+; SSE2-NEXT:    pxor %xmm8, %xmm2
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm6
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT:    por %xmm3, %xmm2
+; SSE2-NEXT:    packsswb %xmm5, %xmm2
+; SSE2-NEXT:    pxor %xmm8, %xmm1
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm3
+; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm1, %xmm3
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm8, %xmm0
+; SSE2-NEXT:    pxor {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pcmpgtd %xmm8, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm8[1,1,3,3]
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
+; SSE2-NEXT:    por %xmm6, %xmm0
+; SSE2-NEXT:    packsswb %xmm3, %xmm0
+; SSE2-NEXT:    packsswb %xmm2, %xmm0
+; SSE2-NEXT:    packsswb %xmm4, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: test_cmp_v16i64:
+; SSE42:       # BB#0:
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT:    packsswb %xmm7, %xmm6
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT:    packsswb %xmm5, %xmm4
+; SSE42-NEXT:    packsswb %xmm6, %xmm4
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT:    packsswb %xmm3, %xmm2
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT:    packsswb %xmm1, %xmm0
+; SSE42-NEXT:    packsswb %xmm2, %xmm0
+; SSE42-NEXT:    packsswb %xmm4, %xmm0
+; SSE42-NEXT:    retq
+;
+; AVX1-LABEL: test_cmp_v16i64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm9
+; AVX1-NEXT:    vpcmpgtq %xmm8, %xmm9, %xmm8
+; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpacksswb %xmm8, %xmm3, %xmm8
+; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm7
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpacksswb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpacksswb %xmm8, %xmm2, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm6, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpacksswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
+; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: test_cmp_v16i64:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpcmpgtq %ymm7, %ymm3, %ymm3
+; AVX2-NEXT:    vpcmpgtq %ymm6, %ymm2, %ymm2
+; AVX2-NEXT:    vpacksswb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX2-NEXT:    vpcmpgtq %ymm5, %ymm1, %ymm1
+; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpacksswb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: test_cmp_v16i64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vextracti32x4 $3, %zmm2, %xmm4
+; AVX512-NEXT:    vpextrq $1, %xmm4, %rcx
+; AVX512-NEXT:    vextracti32x4 $3, %zmm0, %xmm5
+; AVX512-NEXT:    vpextrq $1, %xmm5, %rdx
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    cmpq %rcx, %rdx
+; AVX512-NEXT:    movq $-1, %rcx
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm6
+; AVX512-NEXT:    vmovq %xmm4, %rdx
+; AVX512-NEXT:    vmovq %xmm5, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512-NEXT:    vextracti32x4 $2, %zmm2, %xmm5
+; AVX512-NEXT:    vpextrq $1, %xmm5, %rdx
+; AVX512-NEXT:    vextracti32x4 $2, %zmm0, %xmm6
+; AVX512-NEXT:    vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm7
+; AVX512-NEXT:    vmovq %xmm5, %rdx
+; AVX512-NEXT:    vmovq %xmm6, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm5
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4
+; AVX512-NEXT:    vextracti32x4 $1, %zmm2, %xmm5
+; AVX512-NEXT:    vpextrq $1, %xmm5, %rdx
+; AVX512-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
+; AVX512-NEXT:    vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm7
+; AVX512-NEXT:    vmovq %xmm5, %rdx
+; AVX512-NEXT:    vmovq %xmm6, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm5
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm6
+; AVX512-NEXT:    vmovq %xmm2, %rdx
+; AVX512-NEXT:    vmovq %xmm0, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX512-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vextracti32x4 $3, %zmm3, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rdx
+; AVX512-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
+; AVX512-NEXT:    vpextrq $1, %xmm4, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm5
+; AVX512-NEXT:    vmovq %xmm2, %rdx
+; AVX512-NEXT:    vmovq %xmm4, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm2
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; AVX512-NEXT:    vextracti32x4 $2, %zmm3, %xmm4
+; AVX512-NEXT:    vpextrq $1, %xmm4, %rdx
+; AVX512-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
+; AVX512-NEXT:    vpextrq $1, %xmm5, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm6
+; AVX512-NEXT:    vmovq %xmm4, %rdx
+; AVX512-NEXT:    vmovq %xmm5, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512-NEXT:    vextracti32x4 $1, %zmm3, %xmm4
+; AVX512-NEXT:    vpextrq $1, %xmm4, %rdx
+; AVX512-NEXT:    vextracti32x4 $1, %zmm1, %xmm5
+; AVX512-NEXT:    vpextrq $1, %xmm5, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm6
+; AVX512-NEXT:    vmovq %xmm4, %rdx
+; AVX512-NEXT:    vmovq %xmm5, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm4
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movl $0, %edx
+; AVX512-NEXT:    cmovgq %rcx, %rdx
+; AVX512-NEXT:    vmovq %rdx, %xmm5
+; AVX512-NEXT:    vmovq %xmm3, %rdx
+; AVX512-NEXT:    vmovq %xmm1, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    cmovgq %rcx, %rax
+; AVX512-NEXT:    vmovq %rax, %xmm1
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; AVX512-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = icmp sgt <16 x i64> %a0, %a1
   ret <16 x i1> %1
 }
@@ -4583,7 +4087,7 @@ define <32 x i1> @test_cmp_v32i32(<32 x
 ; AVX512DQ-NEXT:    cmovgl %ecx, %edx
 ; AVX512DQ-NEXT:    vpinsrd $3, %edx, %xmm6, %xmm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm4, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm3, %xmm2
 ; AVX512DQ-NEXT:    vpextrd $1, %xmm2, %edx
@@ -4688,7 +4192,7 @@ define <32 x i1> @test_cmp_v32i32(<32 x
 ; AVX512DQ-NEXT:    cmovgl %ecx, %eax
 ; AVX512DQ-NEXT:    vpinsrd $3, %eax, %xmm5, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    retq
@@ -9176,8 +8680,8 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ;
 ; AVX512DQ-LABEL: test_cmp_v32f64:
 ; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm4, %xmm8
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm0, %xmm9
+; AVX512DQ-NEXT:    vextractf32x4 $3, %zmm4, %xmm8
+; AVX512DQ-NEXT:    vextractf32x4 $3, %zmm0, %xmm9
 ; AVX512DQ-NEXT:    xorl %eax, %eax
 ; AVX512DQ-NEXT:    vucomisd %xmm8, %xmm9
 ; AVX512DQ-NEXT:    movq $-1, %rcx
@@ -9191,8 +8695,8 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm8
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0]
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm4, %xmm9
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm0, %xmm10
+; AVX512DQ-NEXT:    vextractf32x4 $2, %zmm4, %xmm9
+; AVX512DQ-NEXT:    vextractf32x4 $2, %zmm0, %xmm10
 ; AVX512DQ-NEXT:    vucomisd %xmm9, %xmm10
 ; AVX512DQ-NEXT:    movl $0, %edx
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
@@ -9205,8 +8709,8 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm9
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm11[0],xmm9[0]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm4, %xmm9
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm0, %xmm10
+; AVX512DQ-NEXT:    vextractf32x4 $1, %zmm4, %xmm9
+; AVX512DQ-NEXT:    vextractf32x4 $1, %zmm0, %xmm10
 ; AVX512DQ-NEXT:    vucomisd %xmm9, %xmm10
 ; AVX512DQ-NEXT:    movl $0, %edx
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
@@ -9232,8 +8736,8 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm8
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm5, %xmm4
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm1, %xmm0
+; AVX512DQ-NEXT:    vextractf32x4 $3, %zmm5, %xmm4
+; AVX512DQ-NEXT:    vextractf32x4 $3, %zmm1, %xmm0
 ; AVX512DQ-NEXT:    vucomisd %xmm4, %xmm0
 ; AVX512DQ-NEXT:    movl $0, %edx
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
@@ -9245,8 +8749,8 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm0[0]
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm5, %xmm4
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm1, %xmm0
+; AVX512DQ-NEXT:    vextractf32x4 $2, %zmm5, %xmm4
+; AVX512DQ-NEXT:    vextractf32x4 $2, %zmm1, %xmm0
 ; AVX512DQ-NEXT:    vucomisd %xmm4, %xmm0
 ; AVX512DQ-NEXT:    movl $0, %edx
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
@@ -9259,8 +8763,8 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm10[0],xmm0[0]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm5, %xmm4
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm1, %xmm0
+; AVX512DQ-NEXT:    vextractf32x4 $1, %zmm5, %xmm4
+; AVX512DQ-NEXT:    vextractf32x4 $1, %zmm1, %xmm0
 ; AVX512DQ-NEXT:    vucomisd %xmm4, %xmm0
 ; AVX512DQ-NEXT:    movl $0, %edx
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
@@ -9286,10 +8790,10 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm8, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm8
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm6, %xmm1
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm2, %xmm4
+; AVX512DQ-NEXT:    vextractf32x4 $3, %zmm6, %xmm1
+; AVX512DQ-NEXT:    vextractf32x4 $3, %zmm2, %xmm4
 ; AVX512DQ-NEXT:    vucomisd %xmm1, %xmm4
 ; AVX512DQ-NEXT:    movl $0, %edx
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
@@ -9301,8 +8805,8 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm6, %xmm4
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm2, %xmm5
+; AVX512DQ-NEXT:    vextractf32x4 $2, %zmm6, %xmm4
+; AVX512DQ-NEXT:    vextractf32x4 $2, %zmm2, %xmm5
 ; AVX512DQ-NEXT:    vucomisd %xmm4, %xmm5
 ; AVX512DQ-NEXT:    movl $0, %edx
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
@@ -9315,8 +8819,8 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm4
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm6, %xmm1
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm2, %xmm4
+; AVX512DQ-NEXT:    vextractf32x4 $1, %zmm6, %xmm1
+; AVX512DQ-NEXT:    vextractf32x4 $1, %zmm2, %xmm4
 ; AVX512DQ-NEXT:    vucomisd %xmm1, %xmm4
 ; AVX512DQ-NEXT:    movl $0, %edx
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
@@ -9342,8 +8846,8 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm7, %xmm1
-; AVX512DQ-NEXT:    vextractf64x2 $3, %zmm3, %xmm2
+; AVX512DQ-NEXT:    vextractf32x4 $3, %zmm7, %xmm1
+; AVX512DQ-NEXT:    vextractf32x4 $3, %zmm3, %xmm2
 ; AVX512DQ-NEXT:    vucomisd %xmm1, %xmm2
 ; AVX512DQ-NEXT:    movl $0, %edx
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
@@ -9355,8 +8859,8 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm7, %xmm2
-; AVX512DQ-NEXT:    vextractf64x2 $2, %zmm3, %xmm4
+; AVX512DQ-NEXT:    vextractf32x4 $2, %zmm7, %xmm2
+; AVX512DQ-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
 ; AVX512DQ-NEXT:    vucomisd %xmm2, %xmm4
 ; AVX512DQ-NEXT:    movl $0, %edx
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
@@ -9369,8 +8873,8 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm5[0],xmm2[0]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm7, %xmm2
-; AVX512DQ-NEXT:    vextractf64x2 $1, %zmm3, %xmm4
+; AVX512DQ-NEXT:    vextractf32x4 $1, %zmm7, %xmm2
+; AVX512DQ-NEXT:    vextractf32x4 $1, %zmm3, %xmm4
 ; AVX512DQ-NEXT:    vucomisd %xmm2, %xmm4
 ; AVX512DQ-NEXT:    movl $0, %edx
 ; AVX512DQ-NEXT:    cmovaq %rcx, %rdx
@@ -9395,7 +8899,7 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm8, %ymm0
 ; AVX512DQ-NEXT:    retq
@@ -10512,9 +10016,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ;
 ; AVX512DQ-LABEL: test_cmp_v32i64:
 ; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm4, %xmm8
+; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm4, %xmm8
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm8, %rcx
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm0, %xmm9
+; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm9
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm9, %rdx
 ; AVX512DQ-NEXT:    xorl %eax, %eax
 ; AVX512DQ-NEXT:    cmpq %rcx, %rdx
@@ -10529,9 +10033,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm8
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm4, %xmm9
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm4, %xmm9
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm9, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm0, %xmm10
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm10
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm10, %rsi
 ; AVX512DQ-NEXT:    cmpq %rdx, %rsi
 ; AVX512DQ-NEXT:    movl $0, %edx
@@ -10545,9 +10049,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm9
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm4, %xmm9
+; AVX512DQ-NEXT:    vextracti32x4 $1, %zmm4, %xmm9
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm9, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm0, %xmm10
+; AVX512DQ-NEXT:    vextracti32x4 $1, %zmm0, %xmm10
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm10, %rsi
 ; AVX512DQ-NEXT:    cmpq %rdx, %rsi
 ; AVX512DQ-NEXT:    movl $0, %edx
@@ -10576,9 +10080,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm8
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm5, %xmm4
+; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm5, %xmm4
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm1, %xmm0
+; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm0
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm0, %rsi
 ; AVX512DQ-NEXT:    cmpq %rdx, %rsi
 ; AVX512DQ-NEXT:    movl $0, %edx
@@ -10591,9 +10095,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm9 = xmm0[0],xmm9[0]
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm5, %xmm4
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm5, %xmm4
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm1, %xmm0
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm0
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm0, %rsi
 ; AVX512DQ-NEXT:    cmpq %rdx, %rsi
 ; AVX512DQ-NEXT:    movl $0, %edx
@@ -10607,9 +10111,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm5, %xmm0
+; AVX512DQ-NEXT:    vextracti32x4 $1, %zmm5, %xmm0
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm1, %xmm4
+; AVX512DQ-NEXT:    vextracti32x4 $1, %zmm1, %xmm4
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rsi
 ; AVX512DQ-NEXT:    cmpq %rdx, %rsi
 ; AVX512DQ-NEXT:    movl $0, %edx
@@ -10638,11 +10142,11 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm9, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm8, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm8, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm8
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm6, %xmm1
+; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm6, %xmm1
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm1, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm2, %xmm4
+; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm2, %xmm4
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rsi
 ; AVX512DQ-NEXT:    cmpq %rdx, %rsi
 ; AVX512DQ-NEXT:    movl $0, %edx
@@ -10655,9 +10159,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm1
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm6, %xmm4
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm6, %xmm4
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm2, %xmm5
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm2, %xmm5
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm5, %rsi
 ; AVX512DQ-NEXT:    cmpq %rdx, %rsi
 ; AVX512DQ-NEXT:    movl $0, %edx
@@ -10671,9 +10175,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm4
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm4[0],xmm0[0]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm6, %xmm0
+; AVX512DQ-NEXT:    vextracti32x4 $1, %zmm6, %xmm0
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm2, %xmm4
+; AVX512DQ-NEXT:    vextracti32x4 $1, %zmm2, %xmm4
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rsi
 ; AVX512DQ-NEXT:    cmpq %rdx, %rsi
 ; AVX512DQ-NEXT:    movl $0, %edx
@@ -10702,9 +10206,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm1
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm7, %xmm0
+; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm7, %xmm0
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $3, %zmm3, %xmm2
+; AVX512DQ-NEXT:    vextracti32x4 $3, %zmm3, %xmm2
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm2, %rsi
 ; AVX512DQ-NEXT:    cmpq %rdx, %rsi
 ; AVX512DQ-NEXT:    movl $0, %edx
@@ -10717,9 +10221,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    cmovgq %rcx, %rdx
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm7, %xmm2
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm7, %xmm2
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm2, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $2, %zmm3, %xmm4
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm3, %xmm4
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rsi
 ; AVX512DQ-NEXT:    cmpq %rdx, %rsi
 ; AVX512DQ-NEXT:    movl $0, %edx
@@ -10733,9 +10237,9 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    vmovq %rdx, %xmm2
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm7, %xmm0
+; AVX512DQ-NEXT:    vextracti32x4 $1, %zmm7, %xmm0
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512DQ-NEXT:    vextracti64x2 $1, %zmm3, %xmm4
+; AVX512DQ-NEXT:    vextracti32x4 $1, %zmm3, %xmm4
 ; AVX512DQ-NEXT:    vpextrq $1, %xmm4, %rsi
 ; AVX512DQ-NEXT:    cmpq %rdx, %rsi
 ; AVX512DQ-NEXT:    movl $0, %edx
@@ -10763,7 +10267,7 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm8, %ymm0
 ; AVX512DQ-NEXT:    retq

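The AVX512DQ hunks above all make the same substitution: unmasked 128-bit subvector extracts from a zmm register are now checked as the AVX512F encodings (vextracti32x4/vextractf32x4) instead of the DQ-only vextracti64x2/vextractf64x2 forms, and the unmasked 256-bit inserts likewise move from vinserti32x8 to vinserti64x4. A minimal IR sketch of the extract pattern being exercised (a hypothetical reduction, not a function from these test files; the name is illustrative):

; Unmasked extract of the high 128 bits of a 512-bit vector. Both
; vextracti32x4 $3 and vextracti64x2 $3 can encode this; isel now keeps
; the AVX512F form for the unmasked case even when DQ is available.
define <2 x i64> @extract_high_unmasked(<8 x i64> %v) {
  %e = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
  ret <2 x i64> %e
}
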
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll Thu Aug 17 08:40:25 2017
@@ -274,7 +274,7 @@ define <16 x i32> @shuffle_v16i32_0_1_2_
 define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
 ; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vextractf32x8 $1, %zmm0, %ymm1
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
 ; ALL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
 ; ALL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; ALL-NEXT:    retq
@@ -314,7 +314,7 @@ define <8 x float> @shuffle_v16f32_extra
 define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
 ; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vextractf32x8 $1, %zmm0, %ymm1
+; ALL-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
 ; ALL-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2]
 ; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; ALL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,u]
@@ -684,7 +684,7 @@ define <16 x i32> @mask_shuffle_v4i32_v1
 ; ALL:       # BB#0:
 ; ALL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; ALL-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   ret <16 x i32> %res
@@ -695,7 +695,7 @@ define <16 x float> @mask_shuffle_v4f32_
 ; ALL:       # BB#0:
 ; ALL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; ALL-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; ALL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
 ; ALL-NEXT:    retq
   %res = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   ret <16 x float> %res

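The shuffle tests above show the 256-bit analogue: unmasked lane extracts and inserts switch from the DQ-only vextractf32x8/vinsertf32x8 encodings to the AVX512F vextractf64x4/vinsertf64x4 ones. A hedged sketch of IR that produces the broadcast-style insert checked in the last two functions above (the function name below is illustrative):

; Splatting a <4 x float> across all four 128-bit lanes of a zmm. The
; final 256->512 widening is an unmasked subvector insert, now expected
; as vinsertf64x4 rather than the DQ-only vinsertf32x8.
define <16 x float> @splat_v4f32_to_v16f32(<4 x float> %a) {
  %r = shufflevector <4 x float> %a, <4 x float> undef,
       <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3,
                   i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x float> %r
}
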
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll Thu Aug 17 08:40:25 2017
@@ -474,7 +474,7 @@ define <16 x float> @expand13(<8 x float
 ; SKX64-LABEL: expand13:
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; SKX64-NEXT:    vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
+; SKX64-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; SKX64-NEXT:    retq
 ;
 ; KNL64-LABEL: expand13:
@@ -486,7 +486,7 @@ define <16 x float> @expand13(<8 x float
 ; SKX32-LABEL: expand13:
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; SKX32-NEXT:    vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
+; SKX32-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
 ; SKX32-NEXT:    retl
 ;
 ; KNL32-LABEL: expand13:

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll Thu Aug 17 08:40:25 2017
@@ -171,7 +171,7 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u
 ; VL_BW_DQ:       # BB#0:
 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT:    vextracti64x2 $1, %zmm0, %xmm0
+; VL_BW_DQ-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
 ; VL_BW_DQ-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0

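Worth noting for the hunk above: the two encodings differ only in write-mask granularity (vextracti32x4 masks per dword, vextracti64x2 per qword), so the DQ form only buys anything once a write-mask is attached. A hypothetical zero-masked variant, sketched for contrast; it is not a function from this patch, and the exact lowering is an assumption:

define <2 x i64> @extract_lane1_maskz(<8 x i64> %v, i8 %m) {
  ; Unmasked part: pull out qwords 2-3 (the second 128-bit lane).
  %e = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
  ; Build a 2-bit qword write-mask from the low bits of %m. With a mask
  ; in play, the DQ-only masked vextracti64x2 form is the natural
  ; encoding; the unmasked extract above no longer switches to it.
  %mb = bitcast i8 %m to <8 x i1>
  %mm = shufflevector <8 x i1> %mb, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %r = select <2 x i1> %mm, <2 x i64> %e, <2 x i64> zeroinitializer
  ret <2 x i64> %r
}
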
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll?rev=311091&r1=311090&r2=311091&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll Thu Aug 17 08:40:25 2017
@@ -257,38 +257,16 @@ define <16 x i8> @trunc_add_v16i64_v16i8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_add_v16i64_v16i8:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_add_v16i64_v16i8:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: trunc_add_v16i64_v16i8:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512-LABEL: trunc_add_v16i64_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = add <16 x i64> %a0, %a1
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2
@@ -683,35 +661,15 @@ define <16 x i8> @trunc_add_const_v16i64
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2
@@ -1084,38 +1042,16 @@ define <16 x i8> @trunc_sub_v16i64_v16i8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_sub_v16i64_v16i8:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_sub_v16i64_v16i8:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: trunc_sub_v16i64_v16i8:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512-LABEL: trunc_sub_v16i64_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = sub <16 x i64> %a0, %a1
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2
@@ -1510,38 +1446,16 @@ define <16 x i8> @trunc_sub_const_v16i64
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpsubq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpsubq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpsubq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2
@@ -2186,7 +2100,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8
 ; AVX512DQ-NEXT:    vpmullq %zmm2, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
@@ -2732,38 +2646,16 @@ define <16 x i8> @trunc_mul_const_v16i64
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2
@@ -3157,38 +3049,16 @@ define <16 x i8> @trunc_and_v16i64_v16i8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_and_v16i64_v16i8:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpandq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_and_v16i64_v16i8:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpandq %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: trunc_and_v16i64_v16i8:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpandq %zmm3, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpandq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512-LABEL: trunc_and_v16i64_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpandq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = and <16 x i64> %a0, %a1
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2
@@ -3528,35 +3398,15 @@ define <16 x i8> @trunc_and_const_v16i64
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2
@@ -3913,38 +3763,16 @@ define <16 x i8> @trunc_xor_v16i64_v16i8
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_xor_v16i64_v16i8:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_xor_v16i64_v16i8:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: trunc_xor_v16i64_v16i8:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512-LABEL: trunc_xor_v16i64_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpxorq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vpxorq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = xor <16 x i64> %a0, %a1
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2
@@ -4284,35 +4112,15 @@ define <16 x i8> @trunc_xor_const_v16i64
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2
@@ -4669,38 +4477,16 @@ define <16 x i8> @trunc_or_v16i64_v16i8(
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_or_v16i64_v16i8:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vporq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT:    vporq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_or_v16i64_v16i8:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vporq %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT:    vporq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: trunc_or_v16i64_v16i8:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vporq %zmm3, %zmm1, %zmm1
-; AVX512DQ-NEXT:    vporq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512-LABEL: trunc_or_v16i64_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vporq %zmm3, %zmm1, %zmm1
+; AVX512-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = or <16 x i64> %a0, %a1
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2
@@ -5040,35 +4826,15 @@ define <16 x i8> @trunc_or_const_v16i64_
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
   ret <16 x i8> %2

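In vector-trunc-math.ll the vinserti32x8 to vinserti64x4 switch makes the AVX512DQ output identical to the AVX512F and AVX512BW output, so the generated checks collapse the three per-prefix blocks into one shared AVX512 block; only trunc_mul_v16i64_v16i8, which multiplies with the DQ-only vpmullq, keeps a separate AVX512DQ body. Roughly the RUN-line shape that makes the shared prefix work (illustrative; the file's exact flags may differ):

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ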